diff --git a/Makefile b/Makefile index 448ad0e..d31bfef 100644 --- a/Makefile +++ b/Makefile @@ -10,13 +10,9 @@ clean: poetry run jupyter-book clean tamingllms/ -convert: +convert-to-markdown: poetry run jupyter nbconvert --to markdown $(file) d2: - d2 -t 1 --sketch tamingllms/_static/safety/design.d2 tamingllms/_static/safety/design.svg - - -convert-latex: - jupyter nbconvert tamingllms/notebooks/structured_output.ipynb --to latex + d2 -t 1 --sketch $(file) $(output) diff --git a/README.md b/README.md index 93228a1..216b542 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Abstract: **The current discourse around Large Language Models (LLMs) tends to f | About the Book | | | [html](https://www.tamingllms.com/markdown/intro.html) | N/A | *Ready for Review* | | Chapter 1: The Evals Gap | [pdf](https://www.dropbox.com/scl/fi/voyhpqp0glkhijopyev71/DRAFT_Chapter-1-The-Evals-Gap.pdf?rlkey=ehzf6g4ngsssuoe471on8itu4&st=zqv98w2n&dl=0) | [podcast](https://tamingllm.substack.com/p/chapter-1-podcast-the-evals-gap) | [html](https://www.tamingllms.com/notebooks/evals.html) | [ipynb](https://github.com/souzatharsis/tamingLLMs/blob/master/tamingllms/notebooks/evals.ipynb) | *Ready for Review* | | Chapter 2: Structured Output| [pdf](https://www.dropbox.com/scl/fi/x3a84bm1ewcfemj4p7b5p/DRAFT_Chapter-2-Structured-Output.pdf?rlkey=zysw6mat7har133rs7am7bb8n&st=4ns4ak24&dl=0) | podcast | [html](https://www.tamingllms.com/notebooks/structured_output.html) | [ipynb](https://github.com/souzatharsis/tamingLLMs/blob/master/tamingllms/notebooks/structured_output.ipynb) | *Ready for Review* | -| Chapter 3: Managing Input Data | | | [html](https://www.tamingllms.com/notebooks/input.html) | [ipynb](https://github.com/souzatharsis/tamingLLMs/blob/master/tamingllms/notebooks/input.ipynb) | WIP | +| Chapter 3: Managing Input Data | | | [html](https://www.tamingllms.com/notebooks/input.html) | [ipynb](https://github.com/souzatharsis/tamingLLMs/blob/master/tamingllms/notebooks/input.ipynb) | *Ready for Review* | | Chapter 4: Safety | | | [html](https://www.tamingllms.com/notebooks/safety.html) | [ipynb](https://github.com/souzatharsis/tamingLLMs/blob/master/tamingllms/notebooks/safety.ipynb) | *Ready for Review* | | Chapter 5: Preference-Based Alignment | | | [html](https://www.tamingllms.com/notebooks/alignment.html) | [ipynb](https://github.com/souzatharsis/tamingLLMs/blob/master/tamingllms/notebooks/alignment.ipynb) | *Ready for Review* | | Chapter 6: Local LLMs in Practice | | | [html](https://www.tamingllms.com/notebooks/local.html) | [ipynb](https://github.com/souzatharsis/tamingLLMs/blob/master/tamingllms/notebooks/local.ipynb) | *Ready for Review* | diff --git a/TESTIMONIALS.md b/TESTIMONIALS.md new file mode 100644 index 0000000..bcbc103 --- /dev/null +++ b/TESTIMONIALS.md @@ -0,0 +1,5 @@ +> "I clicked on the link to quickly read the comparison result before going to bed. Ended up reading almost the whole website. Great resource, which covers everything I’ve learned in the past year and much more! Thank you!" +-- Julien Nahum, Founder of NotionForms, Ex-SDE at AWS + +> This is amazing content, thank you so much for sharing!!! +-- Didier Lopes, Founder of OpenBB diff --git a/meta2.png b/meta2.png new file mode 100644 index 0000000..d37f391 Binary files /dev/null and b/meta2.png differ diff --git a/meta2.svg b/meta2.svg new file mode 100644 index 0000000..889aada --- /dev/null +++ b/meta2.svg @@ -0,0 +1,894 @@ + + + + + + + + +LLM Judge Pairwise Evaluation SystemPool of LLM JudgesPairwiseSelectorllmcomparison_pairHumanEvaluatorsRankingAlgorithm
LLM Judges Leaderboard
+---------------------
+1. Judge C (0.95)
+2. Judge A (0.92)
+3. Judge B (0.89)
+   ...
+N. Judge X (0.75)
+
+
PromptLLM ResponseJudge AvsJudge B Draw Judges Generate Pair Input forEvaluation Evaluate Preferences GenerateRankings + + + + + + + + + + + + + + + + + + + + +
diff --git a/poetry.lock b/poetry.lock index 204962d..dd2f8ff 100644 --- a/poetry.lock +++ b/poetry.lock @@ -322,6 +322,20 @@ types-python-dateutil = ">=2.8.10" doc = ["doc8", "sphinx (>=7.0.0)", "sphinx-autobuild", "sphinx-autodoc-typehints", "sphinx_rtd_theme (>=1.3.0)"] test = ["dateparser (==1.*)", "pre-commit", "pytest", "pytest-cov", "pytest-mock", "pytz (==2021.1)", "simplejson (==3.*)"] +[[package]] +name = "asgiref" +version = "3.8.1" +description = "ASGI specs, helper code, and adapters" +optional = false +python-versions = ">=3.8" +files = [ + {file = "asgiref-3.8.1-py3-none-any.whl", hash = "sha256:3e1e3ecc849832fe52ccf2cb6686b7a55f82bb1d6aee72a58826471390335e47"}, + {file = "asgiref-3.8.1.tar.gz", hash = "sha256:c343bd80a0bec947a9860adb4c432ffa7db769836c64238fc34bdc3fec84d590"}, +] + +[package.extras] +tests = ["mypy (>=0.800)", "pytest", "pytest-asyncio"] + [[package]] name = "asttokens" version = "2.4.1" @@ -387,6 +401,17 @@ files = [ [package.extras] dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"] +[[package]] +name = "backoff" +version = "2.2.1" +description = "Function decoration for backoff and retry" +optional = false +python-versions = ">=3.7,<4.0" +files = [ + {file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"}, + {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, +] + [[package]] name = "bc-detect-secrets" version = "1.5.15" @@ -407,6 +432,44 @@ unidiff = "*" gibberish = ["gibberish-detector"] word-list = ["pyahocorasick"] +[[package]] +name = "bcrypt" +version = "4.2.1" +description = "Modern password hashing for your software and your servers" +optional = false +python-versions = ">=3.7" +files = [ + {file = "bcrypt-4.2.1-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:1340411a0894b7d3ef562fb233e4b6ed58add185228650942bdc885362f32c17"}, + {file = "bcrypt-4.2.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1ee315739bc8387aa36ff127afc99120ee452924e0df517a8f3e4c0187a0f5f"}, + {file = "bcrypt-4.2.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8dbd0747208912b1e4ce730c6725cb56c07ac734b3629b60d4398f082ea718ad"}, + {file = "bcrypt-4.2.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:aaa2e285be097050dba798d537b6efd9b698aa88eef52ec98d23dcd6d7cf6fea"}, + {file = "bcrypt-4.2.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:76d3e352b32f4eeb34703370e370997065d28a561e4a18afe4fef07249cb4396"}, + {file = "bcrypt-4.2.1-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:b7703ede632dc945ed1172d6f24e9f30f27b1b1a067f32f68bf169c5f08d0425"}, + {file = "bcrypt-4.2.1-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:89df2aea2c43be1e1fa066df5f86c8ce822ab70a30e4c210968669565c0f4685"}, + {file = "bcrypt-4.2.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:04e56e3fe8308a88b77e0afd20bec516f74aecf391cdd6e374f15cbed32783d6"}, + {file = "bcrypt-4.2.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:cfdf3d7530c790432046c40cda41dfee8c83e29482e6a604f8930b9930e94139"}, + {file = "bcrypt-4.2.1-cp37-abi3-win32.whl", hash = "sha256:adadd36274510a01f33e6dc08f5824b97c9580583bd4487c564fc4617b328005"}, + {file = "bcrypt-4.2.1-cp37-abi3-win_amd64.whl", hash = "sha256:8c458cd103e6c5d1d85cf600e546a639f234964d0228909d8f8dbeebff82d526"}, + {file = "bcrypt-4.2.1-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:8ad2f4528cbf0febe80e5a3a57d7a74e6635e41af1ea5675282a33d769fba413"}, + {file = "bcrypt-4.2.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:909faa1027900f2252a9ca5dfebd25fc0ef1417943824783d1c8418dd7d6df4a"}, + {file = "bcrypt-4.2.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cde78d385d5e93ece5479a0a87f73cd6fa26b171c786a884f955e165032b262c"}, + {file = "bcrypt-4.2.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:533e7f3bcf2f07caee7ad98124fab7499cb3333ba2274f7a36cf1daee7409d99"}, + {file = "bcrypt-4.2.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:687cf30e6681eeda39548a93ce9bfbb300e48b4d445a43db4298d2474d2a1e54"}, + {file = "bcrypt-4.2.1-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:041fa0155c9004eb98a232d54da05c0b41d4b8e66b6fc3cb71b4b3f6144ba837"}, + {file = "bcrypt-4.2.1-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:f85b1ffa09240c89aa2e1ae9f3b1c687104f7b2b9d2098da4e923f1b7082d331"}, + {file = "bcrypt-4.2.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:c6f5fa3775966cca251848d4d5393ab016b3afed251163c1436fefdec3b02c84"}, + {file = "bcrypt-4.2.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:807261df60a8b1ccd13e6599c779014a362ae4e795f5c59747f60208daddd96d"}, + {file = "bcrypt-4.2.1-cp39-abi3-win32.whl", hash = "sha256:b588af02b89d9fad33e5f98f7838bf590d6d692df7153647724a7f20c186f6bf"}, + {file = "bcrypt-4.2.1-cp39-abi3-win_amd64.whl", hash = "sha256:e84e0e6f8e40a242b11bce56c313edc2be121cec3e0ec2d76fce01f6af33c07c"}, + {file = "bcrypt-4.2.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:76132c176a6d9953cdc83c296aeaed65e1a708485fd55abf163e0d9f8f16ce0e"}, + {file = "bcrypt-4.2.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e158009a54c4c8bc91d5e0da80920d048f918c61a581f0a63e4e93bb556d362f"}, + {file = "bcrypt-4.2.1.tar.gz", hash = "sha256:6765386e3ab87f569b276988742039baab087b2cdb01e809d74e74503c2faafe"}, +] + +[package.extras] +tests = ["pytest (>=3.2.1,!=3.3.0)"] +typecheck = ["mypy"] + [[package]] name = "beautifulsoup4" version = "4.12.3" @@ -493,6 +556,40 @@ files = [ [package.dependencies] numpy = ">=1.19.3,<3.0.0" +[[package]] +name = "build" +version = "1.2.2.post1" +description = "A simple, correct Python build frontend" +optional = false +python-versions = ">=3.8" +files = [ + {file = "build-1.2.2.post1-py3-none-any.whl", hash = "sha256:1d61c0887fa860c01971625baae8bdd338e517b836a2f70dd1f7aa3a6b2fc5b5"}, + {file = "build-1.2.2.post1.tar.gz", hash = "sha256:b36993e92ca9375a219c99e606a122ff365a760a2d4bba0caa09bd5278b608b7"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "os_name == \"nt\""} +packaging = ">=19.1" +pyproject_hooks = "*" + +[package.extras] +docs = ["furo (>=2023.08.17)", "sphinx (>=7.0,<8.0)", "sphinx-argparse-cli (>=1.5)", "sphinx-autodoc-typehints (>=1.10)", "sphinx-issues (>=3.0.0)"] +test = ["build[uv,virtualenv]", "filelock (>=3)", "pytest (>=6.2.4)", "pytest-cov (>=2.12)", "pytest-mock (>=2)", "pytest-rerunfailures (>=9.1)", "pytest-xdist (>=1.34)", "setuptools (>=42.0.0)", "setuptools (>=56.0.0)", "setuptools (>=56.0.0)", "setuptools (>=67.8.0)", "wheel (>=0.36.0)"] +typing = ["build[uv]", "importlib-metadata (>=5.1)", "mypy (>=1.9.0,<1.10.0)", "tomli", "typing-extensions (>=3.7.4.3)"] +uv = ["uv (>=0.1.18)"] +virtualenv = ["virtualenv (>=20.0.35)"] + +[[package]] +name = "cachetools" +version = "5.5.0" +description = "Extensible memoizing collections and decorators" +optional = false +python-versions = ">=3.7" +files = [ + {file = "cachetools-5.5.0-py3-none-any.whl", hash = "sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292"}, + {file = "cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a"}, +] + [[package]] name = "catalogue" version = "2.0.10" @@ -708,6 +805,88 @@ files = [ {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"}, ] +[[package]] +name = "chroma-hnswlib" +version = "0.7.6" +description = "Chromas fork of hnswlib" +optional = false +python-versions = "*" +files = [ + {file = "chroma_hnswlib-0.7.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f35192fbbeadc8c0633f0a69c3d3e9f1a4eab3a46b65458bbcbcabdd9e895c36"}, + {file = "chroma_hnswlib-0.7.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6f007b608c96362b8f0c8b6b2ac94f67f83fcbabd857c378ae82007ec92f4d82"}, + {file = "chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:456fd88fa0d14e6b385358515aef69fc89b3c2191706fd9aee62087b62aad09c"}, + {file = "chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5dfaae825499c2beaa3b75a12d7ec713b64226df72a5c4097203e3ed532680da"}, + {file = "chroma_hnswlib-0.7.6-cp310-cp310-win_amd64.whl", hash = "sha256:2487201982241fb1581be26524145092c95902cb09fc2646ccfbc407de3328ec"}, + {file = "chroma_hnswlib-0.7.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:81181d54a2b1e4727369486a631f977ffc53c5533d26e3d366dda243fb0998ca"}, + {file = "chroma_hnswlib-0.7.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4b4ab4e11f1083dd0a11ee4f0e0b183ca9f0f2ed63ededba1935b13ce2b3606f"}, + {file = "chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:53db45cd9173d95b4b0bdccb4dbff4c54a42b51420599c32267f3abbeb795170"}, + {file = "chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c093f07a010b499c00a15bc9376036ee4800d335360570b14f7fe92badcdcf9"}, + {file = "chroma_hnswlib-0.7.6-cp311-cp311-win_amd64.whl", hash = "sha256:0540b0ac96e47d0aa39e88ea4714358ae05d64bbe6bf33c52f316c664190a6a3"}, + {file = "chroma_hnswlib-0.7.6-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e87e9b616c281bfbe748d01705817c71211613c3b063021f7ed5e47173556cb7"}, + {file = "chroma_hnswlib-0.7.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ec5ca25bc7b66d2ecbf14502b5729cde25f70945d22f2aaf523c2d747ea68912"}, + {file = "chroma_hnswlib-0.7.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:305ae491de9d5f3c51e8bd52d84fdf2545a4a2bc7af49765cda286b7bb30b1d4"}, + {file = "chroma_hnswlib-0.7.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:822ede968d25a2c88823ca078a58f92c9b5c4142e38c7c8b4c48178894a0a3c5"}, + {file = "chroma_hnswlib-0.7.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2fe6ea949047beed19a94b33f41fe882a691e58b70c55fdaa90274ae78be046f"}, + {file = "chroma_hnswlib-0.7.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feceff971e2a2728c9ddd862a9dd6eb9f638377ad98438876c9aeac96c9482f5"}, + {file = "chroma_hnswlib-0.7.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb0633b60e00a2b92314d0bf5bbc0da3d3320be72c7e3f4a9b19f4609dc2b2ab"}, + {file = "chroma_hnswlib-0.7.6-cp37-cp37m-win_amd64.whl", hash = "sha256:a566abe32fab42291f766d667bdbfa234a7f457dcbd2ba19948b7a978c8ca624"}, + {file = "chroma_hnswlib-0.7.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6be47853d9a58dedcfa90fc846af202b071f028bbafe1d8711bf64fe5a7f6111"}, + {file = "chroma_hnswlib-0.7.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3a7af35bdd39a88bffa49f9bb4bf4f9040b684514a024435a1ef5cdff980579d"}, + {file = "chroma_hnswlib-0.7.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a53b1f1551f2b5ad94eb610207bde1bb476245fc5097a2bec2b476c653c58bde"}, + {file = "chroma_hnswlib-0.7.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3085402958dbdc9ff5626ae58d696948e715aef88c86d1e3f9285a88f1afd3bc"}, + {file = "chroma_hnswlib-0.7.6-cp38-cp38-win_amd64.whl", hash = "sha256:77326f658a15adfb806a16543f7db7c45f06fd787d699e643642d6bde8ed49c4"}, + {file = "chroma_hnswlib-0.7.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:93b056ab4e25adab861dfef21e1d2a2756b18be5bc9c292aa252fa12bb44e6ae"}, + {file = "chroma_hnswlib-0.7.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fe91f018b30452c16c811fd6c8ede01f84e5a9f3c23e0758775e57f1c3778871"}, + {file = "chroma_hnswlib-0.7.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6c0e627476f0f4d9e153420d36042dd9c6c3671cfd1fe511c0253e38c2a1039"}, + {file = "chroma_hnswlib-0.7.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e9796a4536b7de6c6d76a792ba03e08f5aaa53e97e052709568e50b4d20c04f"}, + {file = "chroma_hnswlib-0.7.6-cp39-cp39-win_amd64.whl", hash = "sha256:d30e2db08e7ffdcc415bd072883a322de5995eb6ec28a8f8c054103bbd3ec1e0"}, + {file = "chroma_hnswlib-0.7.6.tar.gz", hash = "sha256:4dce282543039681160259d29fcde6151cc9106c6461e0485f57cdccd83059b7"}, +] + +[package.dependencies] +numpy = "*" + +[[package]] +name = "chromadb" +version = "0.6.1" +description = "Chroma." +optional = false +python-versions = ">=3.9" +files = [ + {file = "chromadb-0.6.1-py3-none-any.whl", hash = "sha256:3483ea9e1271b647f3696e1f39ee9e464bb23a6a9913f42c57a84657c34467bb"}, + {file = "chromadb-0.6.1.tar.gz", hash = "sha256:af55d143fd887f344ff05cd40560566dda1dd13e90ec5a13fb0f5278eb8cde75"}, +] + +[package.dependencies] +bcrypt = ">=4.0.1" +build = ">=1.0.3" +chroma-hnswlib = "0.7.6" +fastapi = ">=0.95.2" +grpcio = ">=1.58.0" +httpx = ">=0.27.0" +importlib-resources = "*" +kubernetes = ">=28.1.0" +mmh3 = ">=4.0.1" +numpy = ">=1.22.5" +onnxruntime = ">=1.14.1" +opentelemetry-api = ">=1.2.0" +opentelemetry-exporter-otlp-proto-grpc = ">=1.2.0" +opentelemetry-instrumentation-fastapi = ">=0.41b0" +opentelemetry-sdk = ">=1.2.0" +orjson = ">=3.9.12" +overrides = ">=7.3.1" +posthog = ">=2.4.0" +pydantic = ">=1.9" +pypika = ">=0.48.9" +PyYAML = ">=6.0.0" +rich = ">=10.11.0" +tenacity = ">=8.2.3" +tokenizers = ">=0.13.2" +tqdm = ">=4.65.0" +typer = ">=0.9.0" +typing_extensions = ">=4.5.0" +uvicorn = {version = ">=0.18.3", extras = ["standard"]} + [[package]] name = "click" version = "8.1.7" @@ -772,6 +951,23 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "coloredlogs" +version = "15.0.1" +description = "Colored terminal output for Python's logging module" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"}, + {file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"}, +] + +[package.dependencies] +humanfriendly = ">=9.1" + +[package.extras] +cron = ["capturer (>=2.4)"] + [[package]] name = "comm" version = "0.2.2" @@ -1151,6 +1347,23 @@ files = [ {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, ] +[[package]] +name = "deprecated" +version = "1.2.15" +description = "Python @deprecated decorator to deprecate old python classes, functions or methods." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" +files = [ + {file = "Deprecated-1.2.15-py2.py3-none-any.whl", hash = "sha256:353bc4a8ac4bfc96800ddab349d89c25dec1079f65fd53acdcc1e0b975b21320"}, + {file = "deprecated-1.2.15.tar.gz", hash = "sha256:683e561a90de76239796e6b6feac66b99030d2dd3fcf61ef996330f14bbb9b0d"}, +] + +[package.dependencies] +wrapt = ">=1.10,<2" + +[package.extras] +dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "jinja2 (>=3.0.3,<3.1.0)", "setuptools", "sphinx (<2)", "tox"] + [[package]] name = "dill" version = "0.3.7" @@ -1359,6 +1572,17 @@ files = [ {file = "docutils-0.19.tar.gz", hash = "sha256:33995a6753c30b7f577febfc2c50411fec6aac7f7ffeb7c4cfe5991072dcf9e6"}, ] +[[package]] +name = "durationpy" +version = "0.9" +description = "Module for converting between datetime.timedelta and Go's Duration strings." +optional = false +python-versions = "*" +files = [ + {file = "durationpy-0.9-py3-none-any.whl", hash = "sha256:e65359a7af5cedad07fb77a2dd3f390f8eb0b74cb845589fa6c057086834dd38"}, + {file = "durationpy-0.9.tar.gz", hash = "sha256:fd3feb0a69a0057d582ef643c355c40d2fa1c942191f914d12203b1a01ac722a"}, +] + [[package]] name = "easyocr" version = "1.7.2" @@ -1471,6 +1695,26 @@ files = [ [package.dependencies] python-dateutil = ">=2.4" +[[package]] +name = "fastapi" +version = "0.115.6" +description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" +optional = false +python-versions = ">=3.8" +files = [ + {file = "fastapi-0.115.6-py3-none-any.whl", hash = "sha256:e9240b29e36fa8f4bb7290316988e90c381e5092e0cbe84e7818cc3713bcf305"}, + {file = "fastapi-0.115.6.tar.gz", hash = "sha256:9ec46f7addc14ea472958a96aae5b5de65f39721a46aaf5705c480d9a8b76654"}, +] + +[package.dependencies] +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0" +starlette = ">=0.40.0,<0.42.0" +typing-extensions = ">=4.8.0" + +[package.extras] +all = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.7)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] +standard = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "jinja2 (>=2.11.2)", "python-multipart (>=0.0.7)", "uvicorn[standard] (>=0.12.0)"] + [[package]] name = "fastjsonschema" version = "2.20.0" @@ -1512,6 +1756,17 @@ files = [ {file = "filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb"}, ] +[[package]] +name = "flatbuffers" +version = "24.12.23" +description = "The FlatBuffers serialization format for Python" +optional = false +python-versions = "*" +files = [ + {file = "flatbuffers-24.12.23-py2.py3-none-any.whl", hash = "sha256:c418e0d48890f4142b92fd3e343e73a48f194e1f80075ddcc5793779b3585444"}, + {file = "flatbuffers-24.12.23.tar.gz", hash = "sha256:2910b0bc6ae9b6db78dd2b18d0b7a0709ba240fb5585f286a3a2b30785c22dac"}, +] + [[package]] name = "fonttools" version = "4.55.0" @@ -1816,6 +2071,47 @@ gitdb = ">=4.0.1,<5" doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"] test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions"] +[[package]] +name = "google-auth" +version = "2.37.0" +description = "Google Authentication Library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google_auth-2.37.0-py2.py3-none-any.whl", hash = "sha256:42664f18290a6be591be5329a96fe30184be1a1badb7292a7f686a9659de9ca0"}, + {file = "google_auth-2.37.0.tar.gz", hash = "sha256:0054623abf1f9c83492c63d3f47e77f0a544caa3d40b2d98e099a611c2dd5d00"}, +] + +[package.dependencies] +cachetools = ">=2.0.0,<6.0" +pyasn1-modules = ">=0.2.1" +rsa = ">=3.1.4,<5" + +[package.extras] +aiohttp = ["aiohttp (>=3.6.2,<4.0.0.dev0)", "requests (>=2.20.0,<3.0.0.dev0)"] +enterprise-cert = ["cryptography", "pyopenssl"] +pyjwt = ["cryptography (>=38.0.3)", "pyjwt (>=2.0)"] +pyopenssl = ["cryptography (>=38.0.3)", "pyopenssl (>=20.0.0)"] +reauth = ["pyu2f (>=0.1.5)"] +requests = ["requests (>=2.20.0,<3.0.0.dev0)"] + +[[package]] +name = "googleapis-common-protos" +version = "1.66.0" +description = "Common protobufs used in Google APIs" +optional = false +python-versions = ">=3.7" +files = [ + {file = "googleapis_common_protos-1.66.0-py2.py3-none-any.whl", hash = "sha256:d7abcd75fabb2e0ec9f74466401f6c119a0b498e27370e9be4c94cb7e382b8ed"}, + {file = "googleapis_common_protos-1.66.0.tar.gz", hash = "sha256:c3e7b33d15fdca5374cc0a7346dd92ffa847425cc4ea941d970f13680052ec8c"}, +] + +[package.dependencies] +protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" + +[package.extras] +grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] + [[package]] name = "greenlet" version = "3.1.1" @@ -1902,6 +2198,73 @@ files = [ docs = ["Sphinx", "furo"] test = ["objgraph", "psutil"] +[[package]] +name = "grpcio" +version = "1.68.1" +description = "HTTP/2-based RPC framework" +optional = false +python-versions = ">=3.8" +files = [ + {file = "grpcio-1.68.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:d35740e3f45f60f3c37b1e6f2f4702c23867b9ce21c6410254c9c682237da68d"}, + {file = "grpcio-1.68.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:d99abcd61760ebb34bdff37e5a3ba333c5cc09feda8c1ad42547bea0416ada78"}, + {file = "grpcio-1.68.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:f8261fa2a5f679abeb2a0a93ad056d765cdca1c47745eda3f2d87f874ff4b8c9"}, + {file = "grpcio-1.68.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0feb02205a27caca128627bd1df4ee7212db051019a9afa76f4bb6a1a80ca95e"}, + {file = "grpcio-1.68.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:919d7f18f63bcad3a0f81146188e90274fde800a94e35d42ffe9eadf6a9a6330"}, + {file = "grpcio-1.68.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:963cc8d7d79b12c56008aabd8b457f400952dbea8997dd185f155e2f228db079"}, + {file = "grpcio-1.68.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ccf2ebd2de2d6661e2520dae293298a3803a98ebfc099275f113ce1f6c2a80f1"}, + {file = "grpcio-1.68.1-cp310-cp310-win32.whl", hash = "sha256:2cc1fd04af8399971bcd4f43bd98c22d01029ea2e56e69c34daf2bf8470e47f5"}, + {file = "grpcio-1.68.1-cp310-cp310-win_amd64.whl", hash = "sha256:ee2e743e51cb964b4975de572aa8fb95b633f496f9fcb5e257893df3be854746"}, + {file = "grpcio-1.68.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:55857c71641064f01ff0541a1776bfe04a59db5558e82897d35a7793e525774c"}, + {file = "grpcio-1.68.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4b177f5547f1b995826ef529d2eef89cca2f830dd8b2c99ffd5fde4da734ba73"}, + {file = "grpcio-1.68.1-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:3522c77d7e6606d6665ec8d50e867f13f946a4e00c7df46768f1c85089eae515"}, + {file = "grpcio-1.68.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9d1fae6bbf0816415b81db1e82fb3bf56f7857273c84dcbe68cbe046e58e1ccd"}, + {file = "grpcio-1.68.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:298ee7f80e26f9483f0b6f94cc0a046caf54400a11b644713bb5b3d8eb387600"}, + {file = "grpcio-1.68.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cbb5780e2e740b6b4f2d208e90453591036ff80c02cc605fea1af8e6fc6b1bbe"}, + {file = "grpcio-1.68.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ddda1aa22495d8acd9dfbafff2866438d12faec4d024ebc2e656784d96328ad0"}, + {file = "grpcio-1.68.1-cp311-cp311-win32.whl", hash = "sha256:b33bd114fa5a83f03ec6b7b262ef9f5cac549d4126f1dc702078767b10c46ed9"}, + {file = "grpcio-1.68.1-cp311-cp311-win_amd64.whl", hash = "sha256:7f20ebec257af55694d8f993e162ddf0d36bd82d4e57f74b31c67b3c6d63d8b2"}, + {file = "grpcio-1.68.1-cp312-cp312-linux_armv7l.whl", hash = "sha256:8829924fffb25386995a31998ccbbeaa7367223e647e0122043dfc485a87c666"}, + {file = "grpcio-1.68.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:3aed6544e4d523cd6b3119b0916cef3d15ef2da51e088211e4d1eb91a6c7f4f1"}, + {file = "grpcio-1.68.1-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:4efac5481c696d5cb124ff1c119a78bddbfdd13fc499e3bc0ca81e95fc573684"}, + {file = "grpcio-1.68.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ab2d912ca39c51f46baf2a0d92aa265aa96b2443266fc50d234fa88bf877d8e"}, + {file = "grpcio-1.68.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95c87ce2a97434dffe7327a4071839ab8e8bffd0054cc74cbe971fba98aedd60"}, + {file = "grpcio-1.68.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e4842e4872ae4ae0f5497bf60a0498fa778c192cc7a9e87877abd2814aca9475"}, + {file = "grpcio-1.68.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:255b1635b0ed81e9f91da4fcc8d43b7ea5520090b9a9ad9340d147066d1d3613"}, + {file = "grpcio-1.68.1-cp312-cp312-win32.whl", hash = "sha256:7dfc914cc31c906297b30463dde0b9be48e36939575eaf2a0a22a8096e69afe5"}, + {file = "grpcio-1.68.1-cp312-cp312-win_amd64.whl", hash = "sha256:a0c8ddabef9c8f41617f213e527254c41e8b96ea9d387c632af878d05db9229c"}, + {file = "grpcio-1.68.1-cp313-cp313-linux_armv7l.whl", hash = "sha256:a47faedc9ea2e7a3b6569795c040aae5895a19dde0c728a48d3c5d7995fda385"}, + {file = "grpcio-1.68.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:390eee4225a661c5cd133c09f5da1ee3c84498dc265fd292a6912b65c421c78c"}, + {file = "grpcio-1.68.1-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:66a24f3d45c33550703f0abb8b656515b0ab777970fa275693a2f6dc8e35f1c1"}, + {file = "grpcio-1.68.1-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c08079b4934b0bf0a8847f42c197b1d12cba6495a3d43febd7e99ecd1cdc8d54"}, + {file = "grpcio-1.68.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8720c25cd9ac25dd04ee02b69256d0ce35bf8a0f29e20577427355272230965a"}, + {file = "grpcio-1.68.1-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:04cfd68bf4f38f5bb959ee2361a7546916bd9a50f78617a346b3aeb2b42e2161"}, + {file = "grpcio-1.68.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:c28848761a6520c5c6071d2904a18d339a796ebe6b800adc8b3f474c5ce3c3ad"}, + {file = "grpcio-1.68.1-cp313-cp313-win32.whl", hash = "sha256:77d65165fc35cff6e954e7fd4229e05ec76102d4406d4576528d3a3635fc6172"}, + {file = "grpcio-1.68.1-cp313-cp313-win_amd64.whl", hash = "sha256:a8040f85dcb9830d8bbb033ae66d272614cec6faceee88d37a88a9bd1a7a704e"}, + {file = "grpcio-1.68.1-cp38-cp38-linux_armv7l.whl", hash = "sha256:eeb38ff04ab6e5756a2aef6ad8d94e89bb4a51ef96e20f45c44ba190fa0bcaad"}, + {file = "grpcio-1.68.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:8a3869a6661ec8f81d93f4597da50336718bde9eb13267a699ac7e0a1d6d0bea"}, + {file = "grpcio-1.68.1-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:2c4cec6177bf325eb6faa6bd834d2ff6aa8bb3b29012cceb4937b86f8b74323c"}, + {file = "grpcio-1.68.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:12941d533f3cd45d46f202e3667be8ebf6bcb3573629c7ec12c3e211d99cfccf"}, + {file = "grpcio-1.68.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80af6f1e69c5e68a2be529990684abdd31ed6622e988bf18850075c81bb1ad6e"}, + {file = "grpcio-1.68.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:e8dbe3e00771bfe3d04feed8210fc6617006d06d9a2679b74605b9fed3e8362c"}, + {file = "grpcio-1.68.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:83bbf5807dc3ee94ce1de2dfe8a356e1d74101e4b9d7aa8c720cc4818a34aded"}, + {file = "grpcio-1.68.1-cp38-cp38-win32.whl", hash = "sha256:8cb620037a2fd9eeee97b4531880e439ebfcd6d7d78f2e7dcc3726428ab5ef63"}, + {file = "grpcio-1.68.1-cp38-cp38-win_amd64.whl", hash = "sha256:52fbf85aa71263380d330f4fce9f013c0798242e31ede05fcee7fbe40ccfc20d"}, + {file = "grpcio-1.68.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:cb400138e73969eb5e0535d1d06cae6a6f7a15f2cc74add320e2130b8179211a"}, + {file = "grpcio-1.68.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a1b988b40f2fd9de5c820f3a701a43339d8dcf2cb2f1ca137e2c02671cc83ac1"}, + {file = "grpcio-1.68.1-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:96f473cdacfdd506008a5d7579c9f6a7ff245a9ade92c3c0265eb76cc591914f"}, + {file = "grpcio-1.68.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:37ea3be171f3cf3e7b7e412a98b77685eba9d4fd67421f4a34686a63a65d99f9"}, + {file = "grpcio-1.68.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ceb56c4285754e33bb3c2fa777d055e96e6932351a3082ce3559be47f8024f0"}, + {file = "grpcio-1.68.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:dffd29a2961f3263a16d73945b57cd44a8fd0b235740cb14056f0612329b345e"}, + {file = "grpcio-1.68.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:025f790c056815b3bf53da850dd70ebb849fd755a4b1ac822cb65cd631e37d43"}, + {file = "grpcio-1.68.1-cp39-cp39-win32.whl", hash = "sha256:1098f03dedc3b9810810568060dea4ac0822b4062f537b0f53aa015269be0a76"}, + {file = "grpcio-1.68.1-cp39-cp39-win_amd64.whl", hash = "sha256:334ab917792904245a028f10e803fcd5b6f36a7b2173a820c0b5b076555825e1"}, + {file = "grpcio-1.68.1.tar.gz", hash = "sha256:44a8502dd5de653ae6a73e2de50a401d84184f0331d0ac3daeb044e66d5c5054"}, +] + +[package.extras] +protobuf = ["grpcio-tools (>=1.68.1)"] + [[package]] name = "h11" version = "0.14.0" @@ -1934,6 +2297,61 @@ http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] trio = ["trio (>=0.22.0,<1.0)"] +[[package]] +name = "httptools" +version = "0.6.4" +description = "A collection of framework independent HTTP protocol utils." +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "httptools-0.6.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3c73ce323711a6ffb0d247dcd5a550b8babf0f757e86a52558fe5b86d6fefcc0"}, + {file = "httptools-0.6.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:345c288418f0944a6fe67be8e6afa9262b18c7626c3ef3c28adc5eabc06a68da"}, + {file = "httptools-0.6.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:deee0e3343f98ee8047e9f4c5bc7cedbf69f5734454a94c38ee829fb2d5fa3c1"}, + {file = "httptools-0.6.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca80b7485c76f768a3bc83ea58373f8db7b015551117375e4918e2aa77ea9b50"}, + {file = "httptools-0.6.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:90d96a385fa941283ebd231464045187a31ad932ebfa541be8edf5b3c2328959"}, + {file = "httptools-0.6.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:59e724f8b332319e2875efd360e61ac07f33b492889284a3e05e6d13746876f4"}, + {file = "httptools-0.6.4-cp310-cp310-win_amd64.whl", hash = "sha256:c26f313951f6e26147833fc923f78f95604bbec812a43e5ee37f26dc9e5a686c"}, + {file = "httptools-0.6.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f47f8ed67cc0ff862b84a1189831d1d33c963fb3ce1ee0c65d3b0cbe7b711069"}, + {file = "httptools-0.6.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0614154d5454c21b6410fdf5262b4a3ddb0f53f1e1721cfd59d55f32138c578a"}, + {file = "httptools-0.6.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8787367fbdfccae38e35abf7641dafc5310310a5987b689f4c32cc8cc3ee975"}, + {file = "httptools-0.6.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40b0f7fe4fd38e6a507bdb751db0379df1e99120c65fbdc8ee6c1d044897a636"}, + {file = "httptools-0.6.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:40a5ec98d3f49904b9fe36827dcf1aadfef3b89e2bd05b0e35e94f97c2b14721"}, + {file = "httptools-0.6.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:dacdd3d10ea1b4ca9df97a0a303cbacafc04b5cd375fa98732678151643d4988"}, + {file = "httptools-0.6.4-cp311-cp311-win_amd64.whl", hash = "sha256:288cd628406cc53f9a541cfaf06041b4c71d751856bab45e3702191f931ccd17"}, + {file = "httptools-0.6.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:df017d6c780287d5c80601dafa31f17bddb170232d85c066604d8558683711a2"}, + {file = "httptools-0.6.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:85071a1e8c2d051b507161f6c3e26155b5c790e4e28d7f236422dbacc2a9cc44"}, + {file = "httptools-0.6.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69422b7f458c5af875922cdb5bd586cc1f1033295aa9ff63ee196a87519ac8e1"}, + {file = "httptools-0.6.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:16e603a3bff50db08cd578d54f07032ca1631450ceb972c2f834c2b860c28ea2"}, + {file = "httptools-0.6.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec4f178901fa1834d4a060320d2f3abc5c9e39766953d038f1458cb885f47e81"}, + {file = "httptools-0.6.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f9eb89ecf8b290f2e293325c646a211ff1c2493222798bb80a530c5e7502494f"}, + {file = "httptools-0.6.4-cp312-cp312-win_amd64.whl", hash = "sha256:db78cb9ca56b59b016e64b6031eda5653be0589dba2b1b43453f6e8b405a0970"}, + {file = "httptools-0.6.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ade273d7e767d5fae13fa637f4d53b6e961fb7fd93c7797562663f0171c26660"}, + {file = "httptools-0.6.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:856f4bc0478ae143bad54a4242fccb1f3f86a6e1be5548fecfd4102061b3a083"}, + {file = "httptools-0.6.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:322d20ea9cdd1fa98bd6a74b77e2ec5b818abdc3d36695ab402a0de8ef2865a3"}, + {file = "httptools-0.6.4-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4d87b29bd4486c0093fc64dea80231f7c7f7eb4dc70ae394d70a495ab8436071"}, + {file = "httptools-0.6.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:342dd6946aa6bda4b8f18c734576106b8a31f2fe31492881a9a160ec84ff4bd5"}, + {file = "httptools-0.6.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b36913ba52008249223042dca46e69967985fb4051951f94357ea681e1f5dc0"}, + {file = "httptools-0.6.4-cp313-cp313-win_amd64.whl", hash = "sha256:28908df1b9bb8187393d5b5db91435ccc9c8e891657f9cbb42a2541b44c82fc8"}, + {file = "httptools-0.6.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:d3f0d369e7ffbe59c4b6116a44d6a8eb4783aae027f2c0b366cf0aa964185dba"}, + {file = "httptools-0.6.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:94978a49b8f4569ad607cd4946b759d90b285e39c0d4640c6b36ca7a3ddf2efc"}, + {file = "httptools-0.6.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40dc6a8e399e15ea525305a2ddba998b0af5caa2566bcd79dcbe8948181eeaff"}, + {file = "httptools-0.6.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab9ba8dcf59de5181f6be44a77458e45a578fc99c31510b8c65b7d5acc3cf490"}, + {file = "httptools-0.6.4-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:fc411e1c0a7dcd2f902c7c48cf079947a7e65b5485dea9decb82b9105ca71a43"}, + {file = "httptools-0.6.4-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:d54efd20338ac52ba31e7da78e4a72570cf729fac82bc31ff9199bedf1dc7440"}, + {file = "httptools-0.6.4-cp38-cp38-win_amd64.whl", hash = "sha256:df959752a0c2748a65ab5387d08287abf6779ae9165916fe053e68ae1fbdc47f"}, + {file = "httptools-0.6.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:85797e37e8eeaa5439d33e556662cc370e474445d5fab24dcadc65a8ffb04003"}, + {file = "httptools-0.6.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:db353d22843cf1028f43c3651581e4bb49374d85692a85f95f7b9a130e1b2cab"}, + {file = "httptools-0.6.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1ffd262a73d7c28424252381a5b854c19d9de5f56f075445d33919a637e3547"}, + {file = "httptools-0.6.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:703c346571fa50d2e9856a37d7cd9435a25e7fd15e236c397bf224afaa355fe9"}, + {file = "httptools-0.6.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:aafe0f1918ed07b67c1e838f950b1c1fabc683030477e60b335649b8020e1076"}, + {file = "httptools-0.6.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:0e563e54979e97b6d13f1bbc05a96109923e76b901f786a5eae36e99c01237bd"}, + {file = "httptools-0.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:b799de31416ecc589ad79dd85a0b2657a8fe39327944998dea368c1d4c9e55e6"}, + {file = "httptools-0.6.4.tar.gz", hash = "sha256:4e93eee4add6493b59a5c514da98c939b244fce4a0d8879cd3f466562f4b7d5c"}, +] + +[package.extras] +test = ["Cython (>=0.29.24)"] + [[package]] name = "httpx" version = "0.27.2" @@ -1993,6 +2411,20 @@ testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gr torch = ["safetensors[torch]", "torch"] typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"] +[[package]] +name = "humanfriendly" +version = "10.0" +description = "Human friendly output for text interfaces using Python" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"}, + {file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"}, +] + +[package.dependencies] +pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""} + [[package]] name = "idna" version = "3.10" @@ -2074,6 +2506,25 @@ perf = ["ipython"] test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] type = ["pytest-mypy"] +[[package]] +name = "importlib-resources" +version = "6.5.2" +description = "Read resources from Python packages" +optional = false +python-versions = ">=3.9" +files = [ + {file = "importlib_resources-6.5.2-py3-none-any.whl", hash = "sha256:789cfdc3ed28c78b67a06acb8126751ced69a3d5f79c095a98298cd8a760ccec"}, + {file = "importlib_resources-6.5.2.tar.gz", hash = "sha256:185f87adef5bcc288449d98fb4fba07cea78bc036455dd44c5fc4a2fe78fed2c"}, +] + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["jaraco.test (>=5.4)", "pytest (>=6,!=8.1.*)", "zipp (>=3.17)"] +type = ["pytest-mypy"] + [[package]] name = "insipid-sphinx-theme" version = "0.4.2" @@ -2784,6 +3235,33 @@ files = [ {file = "kiwisolver-1.4.7.tar.gz", hash = "sha256:9893ff81bd7107f7b685d3017cc6583daadb4fc26e4a888350df530e41980a60"}, ] +[[package]] +name = "kubernetes" +version = "31.0.0" +description = "Kubernetes python client" +optional = false +python-versions = ">=3.6" +files = [ + {file = "kubernetes-31.0.0-py2.py3-none-any.whl", hash = "sha256:bf141e2d380c8520eada8b351f4e319ffee9636328c137aa432bc486ca1200e1"}, + {file = "kubernetes-31.0.0.tar.gz", hash = "sha256:28945de906c8c259c1ebe62703b56a03b714049372196f854105afe4e6d014c0"}, +] + +[package.dependencies] +certifi = ">=14.05.14" +durationpy = ">=0.7" +google-auth = ">=1.0.1" +oauthlib = ">=3.2.2" +python-dateutil = ">=2.5.3" +pyyaml = ">=5.4.1" +requests = "*" +requests-oauthlib = "*" +six = ">=1.9.0" +urllib3 = ">=1.24.2" +websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.dev0 || >=0.43.dev0" + +[package.extras] +adal = ["adal (>=1.0.2)"] + [[package]] name = "langcodes" version = "3.5.0" @@ -3657,6 +4135,130 @@ files = [ {file = "mistune-3.0.2.tar.gz", hash = "sha256:fc7f93ded930c92394ef2cb6f04a8aabab4117a91449e72dcc8dfa646a508be8"}, ] +[[package]] +name = "mmh3" +version = "5.0.1" +description = "Python extension for MurmurHash (MurmurHash3), a set of fast and robust hash functions." +optional = false +python-versions = ">=3.8" +files = [ + {file = "mmh3-5.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f0a4b4bf05778ed77d820d6e7d0e9bd6beb0c01af10e1ce9233f5d2f814fcafa"}, + {file = "mmh3-5.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ac7a391039aeab95810c2d020b69a94eb6b4b37d4e2374831e92db3a0cdf71c6"}, + {file = "mmh3-5.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3a2583b5521ca49756d8d8bceba80627a9cc295f255dcab4e3df7ccc2f09679a"}, + {file = "mmh3-5.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:081a8423fe53c1ac94f87165f3e4c500125d343410c1a0c5f1703e898a3ef038"}, + {file = "mmh3-5.0.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8b4d72713799755dc8954a7d36d5c20a6c8de7b233c82404d122c7c7c1707cc"}, + {file = "mmh3-5.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:389a6fd51efc76d3182d36ec306448559c1244f11227d2bb771bdd0e6cc91321"}, + {file = "mmh3-5.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:39f4128edaa074bff721b1d31a72508cba4d2887ee7867f22082e1fe9d4edea0"}, + {file = "mmh3-5.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d5d23a94d91aabba3386b3769048d5f4210fdfef80393fece2f34ba5a7b466c"}, + {file = "mmh3-5.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:16347d038361f8b8f24fd2b7ef378c9b68ddee9f7706e46269b6e0d322814713"}, + {file = "mmh3-5.0.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:6e299408565af7d61f2d20a5ffdd77cf2ed902460fe4e6726839d59ba4b72316"}, + {file = "mmh3-5.0.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:42050af21ddfc5445ee5a66e73a8fc758c71790305e3ee9e4a85a8e69e810f94"}, + {file = "mmh3-5.0.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:2ae9b1f5ef27ec54659920f0404b7ceb39966e28867c461bfe83a05e8d18ddb0"}, + {file = "mmh3-5.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:50c2495a02045f3047d71d4ae9cdd7a15efc0bcbb7ff17a18346834a8e2d1d19"}, + {file = "mmh3-5.0.1-cp310-cp310-win32.whl", hash = "sha256:c028fa77cddf351ca13b4a56d43c1775652cde0764cadb39120b68f02a23ecf6"}, + {file = "mmh3-5.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:c5e741e421ec14400c4aae30890515c201f518403bdef29ae1e00d375bb4bbb5"}, + {file = "mmh3-5.0.1-cp310-cp310-win_arm64.whl", hash = "sha256:b17156d56fabc73dbf41bca677ceb6faed435cc8544f6566d72ea77d8a17e9d0"}, + {file = "mmh3-5.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9a6d5a9b1b923f1643559ba1fc0bf7a5076c90cbb558878d3bf3641ce458f25d"}, + {file = "mmh3-5.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3349b968be555f7334bbcce839da98f50e1e80b1c615d8e2aa847ea4a964a012"}, + {file = "mmh3-5.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1bd3c94b110e55db02ab9b605029f48a2f7f677c6e58c09d44e42402d438b7e1"}, + {file = "mmh3-5.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d47ba84d48608f79adbb10bb09986b6dc33eeda5c2d1bd75d00820081b73bde9"}, + {file = "mmh3-5.0.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c0217987a8b8525c8d9170f66d036dec4ab45cfbd53d47e8d76125791ceb155e"}, + {file = "mmh3-5.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2797063a34e78d1b61639a98b0edec1c856fa86ab80c7ec859f1796d10ba429"}, + {file = "mmh3-5.0.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8bba16340adcbd47853a2fbe5afdb397549e8f2e79324ff1dced69a3f8afe7c3"}, + {file = "mmh3-5.0.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:282797957c9f60b51b9d768a602c25f579420cc9af46feb77d457a27823d270a"}, + {file = "mmh3-5.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e4fb670c29e63f954f9e7a2cdcd57b36a854c2538f579ef62681ccbaa1de2b69"}, + {file = "mmh3-5.0.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8ee7d85438dc6aff328e19ab052086a3c29e8a9b632998a49e5c4b0034e9e8d6"}, + {file = "mmh3-5.0.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:b7fb5db231f3092444bc13901e6a8d299667126b00636ffbad4a7b45e1051e2f"}, + {file = "mmh3-5.0.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:c100dd441703da5ec136b1d9003ed4a041d8a1136234c9acd887499796df6ad8"}, + {file = "mmh3-5.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:71f3b765138260fd7a7a2dba0ea5727dabcd18c1f80323c9cfef97a7e86e01d0"}, + {file = "mmh3-5.0.1-cp311-cp311-win32.whl", hash = "sha256:9a76518336247fd17689ce3ae5b16883fd86a490947d46a0193d47fb913e26e3"}, + {file = "mmh3-5.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:336bc4df2e44271f1c302d289cc3d78bd52d3eed8d306c7e4bff8361a12bf148"}, + {file = "mmh3-5.0.1-cp311-cp311-win_arm64.whl", hash = "sha256:af6522722fbbc5999aa66f7244d0986767a46f1fb05accc5200f75b72428a508"}, + {file = "mmh3-5.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f2730bb263ed9c388e8860438b057a53e3cc701134a6ea140f90443c4c11aa40"}, + {file = "mmh3-5.0.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6246927bc293f6d56724536400b85fb85f5be26101fa77d5f97dd5e2a4c69bf2"}, + {file = "mmh3-5.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fbca322519a6e6e25b6abf43e940e1667cf8ea12510e07fb4919b48a0cd1c411"}, + {file = "mmh3-5.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eae8c19903ed8a1724ad9e67e86f15d198a7a1271a4f9be83d47e38f312ed672"}, + {file = "mmh3-5.0.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a09fd6cc72c07c0c07c3357714234b646d78052487c4a3bd5f7f6e08408cff60"}, + {file = "mmh3-5.0.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2ff8551fee7ae3b11c5d986b6347ade0dccaadd4670ffdb2b944dee120ffcc84"}, + {file = "mmh3-5.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e39694c73a5a20c8bf36dfd8676ed351e5234d55751ba4f7562d85449b21ef3f"}, + {file = "mmh3-5.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eba6001989a92f72a89c7cf382fda831678bd780707a66b4f8ca90239fdf2123"}, + {file = "mmh3-5.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0771f90c9911811cc606a5c7b7b58f33501c9ee896ed68a6ac22c7d55878ecc0"}, + {file = "mmh3-5.0.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:09b31ed0c0c0920363e96641fac4efde65b1ab62b8df86293142f35a254e72b4"}, + {file = "mmh3-5.0.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:5cf4a8deda0235312db12075331cb417c4ba163770edfe789bde71d08a24b692"}, + {file = "mmh3-5.0.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:41f7090a95185ef20ac018581a99337f0cbc84a2135171ee3290a9c0d9519585"}, + {file = "mmh3-5.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b97b5b368fb7ff22194ec5854f5b12d8de9ab67a0f304728c7f16e5d12135b76"}, + {file = "mmh3-5.0.1-cp312-cp312-win32.whl", hash = "sha256:842516acf04da546f94fad52db125ee619ccbdcada179da51c326a22c4578cb9"}, + {file = "mmh3-5.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:d963be0dbfd9fca209c17172f6110787ebf78934af25e3694fe2ba40e55c1e2b"}, + {file = "mmh3-5.0.1-cp312-cp312-win_arm64.whl", hash = "sha256:a5da292ceeed8ce8e32b68847261a462d30fd7b478c3f55daae841404f433c15"}, + {file = "mmh3-5.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:673e3f1c8d4231d6fb0271484ee34cb7146a6499fc0df80788adb56fd76842da"}, + {file = "mmh3-5.0.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f795a306bd16a52ad578b663462cc8e95500b3925d64118ae63453485d67282b"}, + {file = "mmh3-5.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5ed57a5e28e502a1d60436cc25c76c3a5ba57545f250f2969af231dc1221e0a5"}, + {file = "mmh3-5.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:632c28e7612e909dbb6cbe2fe496201ada4695b7715584005689c5dc038e59ad"}, + {file = "mmh3-5.0.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:53fd6bd525a5985e391c43384672d9d6b317fcb36726447347c7fc75bfed34ec"}, + {file = "mmh3-5.0.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dceacf6b0b961a0e499836af3aa62d60633265607aef551b2a3e3c48cdaa5edd"}, + {file = "mmh3-5.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8f0738d478fdfb5d920f6aff5452c78f2c35b0eff72caa2a97dfe38e82f93da2"}, + {file = "mmh3-5.0.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e70285e7391ab88b872e5bef632bad16b9d99a6d3ca0590656a4753d55988af"}, + {file = "mmh3-5.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:27e5fc6360aa6b828546a4318da1a7da6bf6e5474ccb053c3a6aa8ef19ff97bd"}, + {file = "mmh3-5.0.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7989530c3c1e2c17bf5a0ec2bba09fd19819078ba90beedabb1c3885f5040b0d"}, + {file = "mmh3-5.0.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:cdad7bee649950da7ecd3cbbbd12fb81f1161072ecbdb5acfa0018338c5cb9cf"}, + {file = "mmh3-5.0.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e143b8f184c1bb58cecd85ab4a4fd6dc65a2d71aee74157392c3fddac2a4a331"}, + {file = "mmh3-5.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e5eb12e886f3646dd636f16b76eb23fc0c27e8ff3c1ae73d4391e50ef60b40f6"}, + {file = "mmh3-5.0.1-cp313-cp313-win32.whl", hash = "sha256:16e6dddfa98e1c2d021268e72c78951234186deb4df6630e984ac82df63d0a5d"}, + {file = "mmh3-5.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:d3ffb792d70b8c4a2382af3598dad6ae0c5bd9cee5b7ffcc99aa2f5fd2c1bf70"}, + {file = "mmh3-5.0.1-cp313-cp313-win_arm64.whl", hash = "sha256:122fa9ec148383f9124292962bda745f192b47bfd470b2af5fe7bb3982b17896"}, + {file = "mmh3-5.0.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b12bad8c75e6ff5d67319794fb6a5e8c713826c818d47f850ad08b4aa06960c6"}, + {file = "mmh3-5.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e5bbb066538c1048d542246fc347bb7994bdda29a3aea61c22f9f8b57111ce69"}, + {file = "mmh3-5.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eee6134273f64e2a106827cc8fd77e70cc7239a285006fc6ab4977d59b015af2"}, + {file = "mmh3-5.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d04d9aa19d48e4c7bbec9cabc2c4dccc6ff3b2402f856d5bf0de03e10f167b5b"}, + {file = "mmh3-5.0.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:79f37da1eed034d06567a69a7988456345c7f29e49192831c3975b464493b16e"}, + {file = "mmh3-5.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:242f77666743337aa828a2bf2da71b6ba79623ee7f93edb11e009f69237c8561"}, + {file = "mmh3-5.0.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffd943fff690463945f6441a2465555b3146deaadf6a5e88f2590d14c655d71b"}, + {file = "mmh3-5.0.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:565b15f8d7df43acb791ff5a360795c20bfa68bca8b352509e0fbabd06cc48cd"}, + {file = "mmh3-5.0.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:fc6aafb867c2030df98ac7760ff76b500359252867985f357bd387739f3d5287"}, + {file = "mmh3-5.0.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:32898170644d45aa27c974ab0d067809c066205110f5c6d09f47d9ece6978bfe"}, + {file = "mmh3-5.0.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:42865567838d2193eb64e0ef571f678bf361a254fcdef0c5c8e73243217829bd"}, + {file = "mmh3-5.0.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:5ff5c1f301c4a8b6916498969c0fcc7e3dbc56b4bfce5cfe3fe31f3f4609e5ae"}, + {file = "mmh3-5.0.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:be74c2dda8a6f44a504450aa2c3507f8067a159201586fc01dd41ab80efc350f"}, + {file = "mmh3-5.0.1-cp38-cp38-win32.whl", hash = "sha256:5610a842621ff76c04b20b29cf5f809b131f241a19d4937971ba77dc99a7f330"}, + {file = "mmh3-5.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:de15739ac50776fe8aa1ef13f1be46a6ee1fbd45f6d0651084097eb2be0a5aa4"}, + {file = "mmh3-5.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:48e84cf3cc7e8c41bc07de72299a73b92d9e3cde51d97851420055b1484995f7"}, + {file = "mmh3-5.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6dd9dc28c2d168c49928195c2e29b96f9582a5d07bd690a28aede4cc07b0e696"}, + {file = "mmh3-5.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2771a1c56a3d4bdad990309cff5d0a8051f29c8ec752d001f97d6392194ae880"}, + {file = "mmh3-5.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5ff2a8322ba40951a84411550352fba1073ce1c1d1213bb7530f09aed7f8caf"}, + {file = "mmh3-5.0.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a16bd3ec90682c9e0a343e6bd4c778c09947c8c5395cdb9e5d9b82b2559efbca"}, + {file = "mmh3-5.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d45733a78d68b5b05ff4a823aea51fa664df1d3bf4929b152ff4fd6dea2dd69b"}, + {file = "mmh3-5.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:904285e83cedebc8873b0838ed54c20f7344120be26e2ca5a907ab007a18a7a0"}, + {file = "mmh3-5.0.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac4aeb1784e43df728034d0ed72e4b2648db1a69fef48fa58e810e13230ae5ff"}, + {file = "mmh3-5.0.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:cb3d4f751a0b8b4c8d06ef1c085216c8fddcc8b8c8d72445976b5167a40c6d1e"}, + {file = "mmh3-5.0.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:8021851935600e60c42122ed1176399d7692df338d606195cd599d228a04c1c6"}, + {file = "mmh3-5.0.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:6182d5924a5efc451900f864cbb021d7e8ad5d524816ca17304a0f663bc09bb5"}, + {file = "mmh3-5.0.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:5f30b834552a4f79c92e3d266336fb87fd92ce1d36dc6813d3e151035890abbd"}, + {file = "mmh3-5.0.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:cd4383f35e915e06d077df27e04ffd3be7513ec6a9de2d31f430393f67e192a7"}, + {file = "mmh3-5.0.1-cp39-cp39-win32.whl", hash = "sha256:1455fb6b42665a97db8fc66e89a861e52b567bce27ed054c47877183f86ea6e3"}, + {file = "mmh3-5.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:9e26a0f4eb9855a143f5938a53592fa14c2d3b25801c2106886ab6c173982780"}, + {file = "mmh3-5.0.1-cp39-cp39-win_arm64.whl", hash = "sha256:0d0a35a69abdad7549c4030a714bb4ad07902edb3bbe61e1bbc403ded5d678be"}, + {file = "mmh3-5.0.1.tar.gz", hash = "sha256:7dab080061aeb31a6069a181f27c473a1f67933854e36a3464931f2716508896"}, +] + +[package.extras] +benchmark = ["pymmh3 (==0.0.5)", "pyperf (==2.7.0)", "xxhash (==3.5.0)"] +docs = ["myst-parser (==4.0.0)", "shibuya (==2024.8.30)", "sphinx (==8.0.2)", "sphinx-copybutton (==0.5.2)"] +lint = ["black (==24.8.0)", "clang-format (==18.1.8)", "isort (==5.13.2)", "pylint (==3.2.7)"] +plot = ["matplotlib (==3.9.2)", "pandas (==2.2.2)"] +test = ["pytest (==8.3.3)", "pytest-sugar (==1.0.0)"] +type = ["mypy (==1.11.2)"] + +[[package]] +name = "monotonic" +version = "1.6" +description = "An implementation of time.monotonic() for Python 2 & < 3.3" +optional = false +python-versions = "*" +files = [ + {file = "monotonic-1.6-py2.py3-none-any.whl", hash = "sha256:68687e19a14f11f26d140dd5c86f3dba4bf5df58003000ed467e0e2a69bca96c"}, + {file = "monotonic-1.6.tar.gz", hash = "sha256:3a55207bcfed53ddd5c5bae174524062935efed17792e9de2ad0205ce9ad63f7"}, +] + [[package]] name = "mpire" version = "2.10.2" @@ -4389,6 +4991,22 @@ files = [ {file = "nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485"}, ] +[[package]] +name = "oauthlib" +version = "3.2.2" +description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" +optional = false +python-versions = ">=3.6" +files = [ + {file = "oauthlib-3.2.2-py3-none-any.whl", hash = "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca"}, + {file = "oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"}, +] + +[package.extras] +rsa = ["cryptography (>=3.0.0)"] +signals = ["blinker (>=1.4.0)"] +signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] + [[package]] name = "oldest-supported-numpy" version = "2022.1.30" @@ -4460,6 +5078,44 @@ files = [ httpx = ">=0.27.0,<0.28.0" pydantic = ">=2.9.0,<3.0.0" +[[package]] +name = "onnxruntime" +version = "1.20.1" +description = "ONNX Runtime is a runtime accelerator for Machine Learning models" +optional = false +python-versions = "*" +files = [ + {file = "onnxruntime-1.20.1-cp310-cp310-macosx_13_0_universal2.whl", hash = "sha256:e50ba5ff7fed4f7d9253a6baf801ca2883cc08491f9d32d78a80da57256a5439"}, + {file = "onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7b2908b50101a19e99c4d4e97ebb9905561daf61829403061c1adc1b588bc0de"}, + {file = "onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d82daaec24045a2e87598b8ac2b417b1cce623244e80e663882e9fe1aae86410"}, + {file = "onnxruntime-1.20.1-cp310-cp310-win32.whl", hash = "sha256:4c4b251a725a3b8cf2aab284f7d940c26094ecd9d442f07dd81ab5470e99b83f"}, + {file = "onnxruntime-1.20.1-cp310-cp310-win_amd64.whl", hash = "sha256:d3b616bb53a77a9463707bb313637223380fc327f5064c9a782e8ec69c22e6a2"}, + {file = "onnxruntime-1.20.1-cp311-cp311-macosx_13_0_universal2.whl", hash = "sha256:06bfbf02ca9ab5f28946e0f912a562a5f005301d0c419283dc57b3ed7969bb7b"}, + {file = "onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f6243e34d74423bdd1edf0ae9596dd61023b260f546ee17d701723915f06a9f7"}, + {file = "onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5eec64c0269dcdb8d9a9a53dc4d64f87b9e0c19801d9321246a53b7eb5a7d1bc"}, + {file = "onnxruntime-1.20.1-cp311-cp311-win32.whl", hash = "sha256:a19bc6e8c70e2485a1725b3d517a2319603acc14c1f1a017dda0afe6d4665b41"}, + {file = "onnxruntime-1.20.1-cp311-cp311-win_amd64.whl", hash = "sha256:8508887eb1c5f9537a4071768723ec7c30c28eb2518a00d0adcd32c89dea3221"}, + {file = "onnxruntime-1.20.1-cp312-cp312-macosx_13_0_universal2.whl", hash = "sha256:22b0655e2bf4f2161d52706e31f517a0e54939dc393e92577df51808a7edc8c9"}, + {file = "onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1f56e898815963d6dc4ee1c35fc6c36506466eff6d16f3cb9848cea4e8c8172"}, + {file = "onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bb71a814f66517a65628c9e4a2bb530a6edd2cd5d87ffa0af0f6f773a027d99e"}, + {file = "onnxruntime-1.20.1-cp312-cp312-win32.whl", hash = "sha256:bd386cc9ee5f686ee8a75ba74037750aca55183085bf1941da8efcfe12d5b120"}, + {file = "onnxruntime-1.20.1-cp312-cp312-win_amd64.whl", hash = "sha256:19c2d843eb074f385e8bbb753a40df780511061a63f9def1b216bf53860223fb"}, + {file = "onnxruntime-1.20.1-cp313-cp313-macosx_13_0_universal2.whl", hash = "sha256:cc01437a32d0042b606f462245c8bbae269e5442797f6213e36ce61d5abdd8cc"}, + {file = "onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fb44b08e017a648924dbe91b82d89b0c105b1adcfe31e90d1dc06b8677ad37be"}, + {file = "onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bda6aebdf7917c1d811f21d41633df00c58aff2bef2f598f69289c1f1dabc4b3"}, + {file = "onnxruntime-1.20.1-cp313-cp313-win_amd64.whl", hash = "sha256:d30367df7e70f1d9fc5a6a68106f5961686d39b54d3221f760085524e8d38e16"}, + {file = "onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9158465745423b2b5d97ed25aa7740c7d38d2993ee2e5c3bfacb0c4145c49d8"}, + {file = "onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0df6f2df83d61f46e842dbcde610ede27218947c33e994545a22333491e72a3b"}, +] + +[package.dependencies] +coloredlogs = "*" +flatbuffers = "*" +numpy = ">=1.21.6" +packaging = "*" +protobuf = "*" +sympy = "*" + [[package]] name = "openai" version = "1.54.4" @@ -4520,6 +5176,170 @@ files = [ [package.dependencies] et-xmlfile = "*" +[[package]] +name = "opentelemetry-api" +version = "1.29.0" +description = "OpenTelemetry Python API" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_api-1.29.0-py3-none-any.whl", hash = "sha256:5fcd94c4141cc49c736271f3e1efb777bebe9cc535759c54c936cca4f1b312b8"}, + {file = "opentelemetry_api-1.29.0.tar.gz", hash = "sha256:d04a6cf78aad09614f52964ecb38021e248f5714dc32c2e0d8fd99517b4d69cf"}, +] + +[package.dependencies] +deprecated = ">=1.2.6" +importlib-metadata = ">=6.0,<=8.5.0" + +[[package]] +name = "opentelemetry-exporter-otlp-proto-common" +version = "1.29.0" +description = "OpenTelemetry Protobuf encoding" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_exporter_otlp_proto_common-1.29.0-py3-none-any.whl", hash = "sha256:a9d7376c06b4da9cf350677bcddb9618ed4b8255c3f6476975f5e38274ecd3aa"}, + {file = "opentelemetry_exporter_otlp_proto_common-1.29.0.tar.gz", hash = "sha256:e7c39b5dbd1b78fe199e40ddfe477e6983cb61aa74ba836df09c3869a3e3e163"}, +] + +[package.dependencies] +opentelemetry-proto = "1.29.0" + +[[package]] +name = "opentelemetry-exporter-otlp-proto-grpc" +version = "1.29.0" +description = "OpenTelemetry Collector Protobuf over gRPC Exporter" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3-none-any.whl", hash = "sha256:5a2a3a741a2543ed162676cf3eefc2b4150e6f4f0a193187afb0d0e65039c69c"}, + {file = "opentelemetry_exporter_otlp_proto_grpc-1.29.0.tar.gz", hash = "sha256:3d324d07d64574d72ed178698de3d717f62a059a93b6b7685ee3e303384e73ea"}, +] + +[package.dependencies] +deprecated = ">=1.2.6" +googleapis-common-protos = ">=1.52,<2.0" +grpcio = ">=1.63.2,<2.0.0" +opentelemetry-api = ">=1.15,<2.0" +opentelemetry-exporter-otlp-proto-common = "1.29.0" +opentelemetry-proto = "1.29.0" +opentelemetry-sdk = ">=1.29.0,<1.30.0" + +[[package]] +name = "opentelemetry-instrumentation" +version = "0.50b0" +description = "Instrumentation Tools & Auto Instrumentation for OpenTelemetry Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_instrumentation-0.50b0-py3-none-any.whl", hash = "sha256:b8f9fc8812de36e1c6dffa5bfc6224df258841fb387b6dfe5df15099daa10630"}, + {file = "opentelemetry_instrumentation-0.50b0.tar.gz", hash = "sha256:7d98af72de8dec5323e5202e46122e5f908592b22c6d24733aad619f07d82979"}, +] + +[package.dependencies] +opentelemetry-api = ">=1.4,<2.0" +opentelemetry-semantic-conventions = "0.50b0" +packaging = ">=18.0" +wrapt = ">=1.0.0,<2.0.0" + +[[package]] +name = "opentelemetry-instrumentation-asgi" +version = "0.50b0" +description = "ASGI instrumentation for OpenTelemetry" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_instrumentation_asgi-0.50b0-py3-none-any.whl", hash = "sha256:2ba1297f746e55dec5a17fe825689da0613662fb25c004c3965a6c54b1d5be22"}, + {file = "opentelemetry_instrumentation_asgi-0.50b0.tar.gz", hash = "sha256:3ca4cb5616ae6a3e8ce86e7d5c360a8d8cc8ed722cf3dc8a5e44300774e87d49"}, +] + +[package.dependencies] +asgiref = ">=3.0,<4.0" +opentelemetry-api = ">=1.12,<2.0" +opentelemetry-instrumentation = "0.50b0" +opentelemetry-semantic-conventions = "0.50b0" +opentelemetry-util-http = "0.50b0" + +[package.extras] +instruments = ["asgiref (>=3.0,<4.0)"] + +[[package]] +name = "opentelemetry-instrumentation-fastapi" +version = "0.50b0" +description = "OpenTelemetry FastAPI Instrumentation" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_instrumentation_fastapi-0.50b0-py3-none-any.whl", hash = "sha256:8f03b738495e4705fbae51a2826389c7369629dace89d0f291c06ffefdff5e52"}, + {file = "opentelemetry_instrumentation_fastapi-0.50b0.tar.gz", hash = "sha256:16b9181682136da210295def2bb304a32fb9bdee9a935cdc9da43567f7c1149e"}, +] + +[package.dependencies] +opentelemetry-api = ">=1.12,<2.0" +opentelemetry-instrumentation = "0.50b0" +opentelemetry-instrumentation-asgi = "0.50b0" +opentelemetry-semantic-conventions = "0.50b0" +opentelemetry-util-http = "0.50b0" + +[package.extras] +instruments = ["fastapi (>=0.58,<1.0)"] + +[[package]] +name = "opentelemetry-proto" +version = "1.29.0" +description = "OpenTelemetry Python Proto" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_proto-1.29.0-py3-none-any.whl", hash = "sha256:495069c6f5495cbf732501cdcd3b7f60fda2b9d3d4255706ca99b7ca8dec53ff"}, + {file = "opentelemetry_proto-1.29.0.tar.gz", hash = "sha256:3c136aa293782e9b44978c738fff72877a4b78b5d21a64e879898db7b2d93e5d"}, +] + +[package.dependencies] +protobuf = ">=5.0,<6.0" + +[[package]] +name = "opentelemetry-sdk" +version = "1.29.0" +description = "OpenTelemetry Python SDK" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_sdk-1.29.0-py3-none-any.whl", hash = "sha256:173be3b5d3f8f7d671f20ea37056710217959e774e2749d984355d1f9391a30a"}, + {file = "opentelemetry_sdk-1.29.0.tar.gz", hash = "sha256:b0787ce6aade6ab84315302e72bd7a7f2f014b0fb1b7c3295b88afe014ed0643"}, +] + +[package.dependencies] +opentelemetry-api = "1.29.0" +opentelemetry-semantic-conventions = "0.50b0" +typing-extensions = ">=3.7.4" + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.50b0" +description = "OpenTelemetry Semantic Conventions" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_semantic_conventions-0.50b0-py3-none-any.whl", hash = "sha256:e87efba8fdb67fb38113efea6a349531e75ed7ffc01562f65b802fcecb5e115e"}, + {file = "opentelemetry_semantic_conventions-0.50b0.tar.gz", hash = "sha256:02dc6dbcb62f082de9b877ff19a3f1ffaa3c306300fa53bfac761c4567c83d38"}, +] + +[package.dependencies] +deprecated = ">=1.2.6" +opentelemetry-api = "1.29.0" + +[[package]] +name = "opentelemetry-util-http" +version = "0.50b0" +description = "Web util for OpenTelemetry" +optional = false +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_util_http-0.50b0-py3-none-any.whl", hash = "sha256:21f8aedac861ffa3b850f8c0a6c373026189eb8630ac6e14a2bf8c55695cc090"}, + {file = "opentelemetry_util_http-0.50b0.tar.gz", hash = "sha256:dc4606027e1bc02aabb9533cc330dd43f874fca492e4175c31d7154f341754af"}, +] + [[package]] name = "orjson" version = "3.10.12" @@ -5028,6 +5848,29 @@ docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-a test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] type = ["mypy (>=1.11.2)"] +[[package]] +name = "posthog" +version = "3.7.5" +description = "Integrate PostHog into any python application." +optional = false +python-versions = "*" +files = [ + {file = "posthog-3.7.5-py2.py3-none-any.whl", hash = "sha256:022132c17069dde03c5c5904e2ae1b9bd68d5059cbc5a8dffc5c1537a1b71cb5"}, + {file = "posthog-3.7.5.tar.gz", hash = "sha256:8ba40ab623da35db72715fc87fe7dccb7fc272ced92581fe31db2d4dbe7ad761"}, +] + +[package.dependencies] +backoff = ">=1.10.0" +monotonic = ">=1.5" +python-dateutil = ">2.1" +requests = ">=2.7,<3.0" +six = ">=1.5" + +[package.extras] +dev = ["black", "flake8", "flake8-print", "isort", "pre-commit"] +sentry = ["django", "sentry-sdk"] +test = ["coverage", "django", "flake8", "freezegun (==0.3.15)", "mock (>=2.0.0)", "pylint", "pytest", "pytest-timeout"] + [[package]] name = "preshed" version = "3.0.9" @@ -5244,6 +6087,26 @@ files = [ {file = "propcache-0.2.0.tar.gz", hash = "sha256:df81779732feb9d01e5d513fad0122efb3d53bbc75f61b2a4f29a020bc985e70"}, ] +[[package]] +name = "protobuf" +version = "5.29.2" +description = "" +optional = false +python-versions = ">=3.8" +files = [ + {file = "protobuf-5.29.2-cp310-abi3-win32.whl", hash = "sha256:c12ba8249f5624300cf51c3d0bfe5be71a60c63e4dcf51ffe9a68771d958c851"}, + {file = "protobuf-5.29.2-cp310-abi3-win_amd64.whl", hash = "sha256:842de6d9241134a973aab719ab42b008a18a90f9f07f06ba480df268f86432f9"}, + {file = "protobuf-5.29.2-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:a0c53d78383c851bfa97eb42e3703aefdc96d2036a41482ffd55dc5f529466eb"}, + {file = "protobuf-5.29.2-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:494229ecd8c9009dd71eda5fd57528395d1eacdf307dbece6c12ad0dd09e912e"}, + {file = "protobuf-5.29.2-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:b6b0d416bbbb9d4fbf9d0561dbfc4e324fd522f61f7af0fe0f282ab67b22477e"}, + {file = "protobuf-5.29.2-cp38-cp38-win32.whl", hash = "sha256:e621a98c0201a7c8afe89d9646859859be97cb22b8bf1d8eacfd90d5bda2eb19"}, + {file = "protobuf-5.29.2-cp38-cp38-win_amd64.whl", hash = "sha256:13d6d617a2a9e0e82a88113d7191a1baa1e42c2cc6f5f1398d3b054c8e7e714a"}, + {file = "protobuf-5.29.2-cp39-cp39-win32.whl", hash = "sha256:36000f97ea1e76e8398a3f02936aac2a5d2b111aae9920ec1b769fc4a222c4d9"}, + {file = "protobuf-5.29.2-cp39-cp39-win_amd64.whl", hash = "sha256:2d2e674c58a06311c8e99e74be43e7f3a8d1e2b2fdf845eaa347fbd866f23355"}, + {file = "protobuf-5.29.2-py3-none-any.whl", hash = "sha256:fde4554c0e578a5a0bcc9a276339594848d1e89f9ea47b4427c80e5d72f90181"}, + {file = "protobuf-5.29.2.tar.gz", hash = "sha256:b2cc8e8bb7c9326996f0e160137b0861f1a82162502658df2951209d0cb0309e"}, +] + [[package]] name = "psutil" version = "6.1.0" @@ -5364,6 +6227,31 @@ files = [ [package.extras] test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] +[[package]] +name = "pyasn1" +version = "0.6.1" +description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629"}, + {file = "pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034"}, +] + +[[package]] +name = "pyasn1-modules" +version = "0.4.1" +description = "A collection of ASN.1-based protocols modules" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyasn1_modules-0.4.1-py3-none-any.whl", hash = "sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd"}, + {file = "pyasn1_modules-0.4.1.tar.gz", hash = "sha256:c28e2dbf9c06ad61c71a075c7e0f9fd0f1b0bb2d2ad4377f240d33ac2ab60a7c"}, +] + +[package.dependencies] +pyasn1 = ">=0.4.6,<0.7.0" + [[package]] name = "pybtex" version = "0.24.0" @@ -5760,6 +6648,41 @@ files = [ {file = "pypdfium2-4.30.1.tar.gz", hash = "sha256:5f5c7c6d03598e107d974f66b220a49436aceb191da34cda5f692be098a814ce"}, ] +[[package]] +name = "pypika" +version = "0.48.9" +description = "A SQL query builder API for Python" +optional = false +python-versions = "*" +files = [ + {file = "PyPika-0.48.9.tar.gz", hash = "sha256:838836a61747e7c8380cd1b7ff638694b7a7335345d0f559b04b2cd832ad5378"}, +] + +[[package]] +name = "pyproject-hooks" +version = "1.2.0" +description = "Wrappers to call pyproject.toml-based build backend hooks." +optional = false +python-versions = ">=3.7" +files = [ + {file = "pyproject_hooks-1.2.0-py3-none-any.whl", hash = "sha256:9e5c6bfa8dcc30091c74b0cf803c81fdd29d94f01992a7707bc97babb1141913"}, + {file = "pyproject_hooks-1.2.0.tar.gz", hash = "sha256:1e859bd5c40fae9448642dd871adf459e5e2084186e8d2c2a79a824c970da1f8"}, +] + +[[package]] +name = "pyreadline3" +version = "3.5.4" +description = "A python implementation of GNU readline." +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6"}, + {file = "pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7"}, +] + +[package.extras] +dev = ["build", "flake8", "mypy", "pytest", "twine"] + [[package]] name = "python-bidi" version = "0.6.3" @@ -6452,6 +7375,24 @@ files = [ [package.dependencies] requests = ">=1.0.0" +[[package]] +name = "requests-oauthlib" +version = "2.0.0" +description = "OAuthlib authentication support for Requests." +optional = false +python-versions = ">=3.4" +files = [ + {file = "requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9"}, + {file = "requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = "sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36"}, +] + +[package.dependencies] +oauthlib = ">=3.0.0" +requests = ">=2.0.0" + +[package.extras] +rsa = ["oauthlib[signedtoken] (>=3.0.0)"] + [[package]] name = "requests-toolbelt" version = "1.0.0" @@ -6624,6 +7565,20 @@ files = [ {file = "rpds_py-0.21.0.tar.gz", hash = "sha256:ed6378c9d66d0de903763e7706383d60c33829581f0adff47b6535f1802fa6db"}, ] +[[package]] +name = "rsa" +version = "4.9" +description = "Pure-Python RSA implementation" +optional = false +python-versions = ">=3.6,<4" +files = [ + {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, + {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, +] + +[package.dependencies] +pyasn1 = ">=0.1.3" + [[package]] name = "rtree" version = "1.3.0" @@ -6827,6 +7782,60 @@ docs = ["PyWavelets (>=1.6)", "dask[array] (>=2022.9.2)", "intersphinx-registry optional = ["PyWavelets (>=1.6)", "SimpleITK", "astropy (>=5.0)", "cloudpickle (>=0.2.1)", "dask[array] (>=2021.1.0,!=2024.8.0)", "matplotlib (>=3.7)", "pooch (>=1.6.0)", "pyamg (>=5.2)", "scikit-learn (>=1.2)"] test = ["asv", "numpydoc (>=1.7)", "pooch (>=1.6.0)", "pytest (>=7.0)", "pytest-cov (>=2.11.0)", "pytest-doctestplus", "pytest-faulthandler", "pytest-localserver"] +[[package]] +name = "scikit-learn" +version = "1.6.0" +description = "A set of python modules for machine learning and data mining" +optional = false +python-versions = ">=3.9" +files = [ + {file = "scikit_learn-1.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:366fb3fa47dce90afed3d6106183f4978d6f24cfd595c2373424171b915ee718"}, + {file = "scikit_learn-1.6.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:59cd96a8d9f8dfd546f5d6e9787e1b989e981388d7803abbc9efdcde61e47460"}, + {file = "scikit_learn-1.6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efa7a579606c73a0b3d210e33ea410ea9e1af7933fe324cb7e6fbafae4ea5948"}, + {file = "scikit_learn-1.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a46d3ca0f11a540b8eaddaf5e38172d8cd65a86cb3e3632161ec96c0cffb774c"}, + {file = "scikit_learn-1.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:5be4577769c5dde6e1b53de8e6520f9b664ab5861dd57acee47ad119fd7405d6"}, + {file = "scikit_learn-1.6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1f50b4f24cf12a81c3c09958ae3b864d7534934ca66ded3822de4996d25d7285"}, + {file = "scikit_learn-1.6.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:eb9ae21f387826da14b0b9cb1034f5048ddb9182da429c689f5f4a87dc96930b"}, + {file = "scikit_learn-1.6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0baa91eeb8c32632628874a5c91885eaedd23b71504d24227925080da075837a"}, + {file = "scikit_learn-1.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c716d13ba0a2f8762d96ff78d3e0cde90bc9c9b5c13d6ab6bb9b2d6ca6705fd"}, + {file = "scikit_learn-1.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:9aafd94bafc841b626681e626be27bf1233d5a0f20f0a6fdb4bee1a1963c6643"}, + {file = "scikit_learn-1.6.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:04a5ba45c12a5ff81518aa4f1604e826a45d20e53da47b15871526cda4ff5174"}, + {file = "scikit_learn-1.6.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:21fadfc2ad7a1ce8bd1d90f23d17875b84ec765eecbbfc924ff11fb73db582ce"}, + {file = "scikit_learn-1.6.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30f34bb5fde90e020653bb84dcb38b6c83f90c70680dbd8c38bd9becbad7a127"}, + {file = "scikit_learn-1.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1dad624cffe3062276a0881d4e441bc9e3b19d02d17757cd6ae79a9d192a0027"}, + {file = "scikit_learn-1.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:2fce7950a3fad85e0a61dc403df0f9345b53432ac0e47c50da210d22c60b6d85"}, + {file = "scikit_learn-1.6.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e5453b2e87ef8accedc5a8a4e6709f887ca01896cd7cc8a174fe39bd4bb00aef"}, + {file = "scikit_learn-1.6.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:5fe11794236fb83bead2af26a87ced5d26e3370b8487430818b915dafab1724e"}, + {file = "scikit_learn-1.6.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61fe3dcec0d82ae280877a818ab652f4988371e32dd5451e75251bece79668b1"}, + {file = "scikit_learn-1.6.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b44e3a51e181933bdf9a4953cc69c6025b40d2b49e238233f149b98849beb4bf"}, + {file = "scikit_learn-1.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:a17860a562bac54384454d40b3f6155200c1c737c9399e6a97962c63fce503ac"}, + {file = "scikit_learn-1.6.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:98717d3c152f6842d36a70f21e1468fb2f1a2f8f2624d9a3f382211798516426"}, + {file = "scikit_learn-1.6.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:34e20bfac8ff0ebe0ff20fb16a4d6df5dc4cc9ce383e00c2ab67a526a3c67b18"}, + {file = "scikit_learn-1.6.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eba06d75815406091419e06dd650b91ebd1c5f836392a0d833ff36447c2b1bfa"}, + {file = "scikit_learn-1.6.0-cp313-cp313t-win_amd64.whl", hash = "sha256:b6916d1cec1ff163c7d281e699d7a6a709da2f2c5ec7b10547e08cc788ddd3ae"}, + {file = "scikit_learn-1.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:66b1cf721a9f07f518eb545098226796c399c64abdcbf91c2b95d625068363da"}, + {file = "scikit_learn-1.6.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:7b35b60cf4cd6564b636e4a40516b3c61a4fa7a8b1f7a3ce80c38ebe04750bc3"}, + {file = "scikit_learn-1.6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a73b1c2038c93bc7f4bf21f6c9828d5116c5d2268f7a20cfbbd41d3074d52083"}, + {file = "scikit_learn-1.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c3fa7d3dd5a0ec2d0baba0d644916fa2ab180ee37850c5d536245df916946bd"}, + {file = "scikit_learn-1.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:df778486a32518cda33818b7e3ce48c78cef1d5f640a6bc9d97c6d2e71449a51"}, + {file = "scikit_learn-1.6.0.tar.gz", hash = "sha256:9d58481f9f7499dff4196927aedd4285a0baec8caa3790efbe205f13de37dd6e"}, +] + +[package.dependencies] +joblib = ">=1.2.0" +numpy = ">=1.19.5" +scipy = ">=1.6.0" +threadpoolctl = ">=3.1.0" + +[package.extras] +benchmark = ["matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "pandas (>=1.1.5)"] +build = ["cython (>=3.0.10)", "meson-python (>=0.16.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)"] +docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pydata-sphinx-theme (>=0.15.3)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)", "sphinx (>=7.3.7)", "sphinx-copybutton (>=0.5.2)", "sphinx-design (>=0.5.0)", "sphinx-design (>=0.6.0)", "sphinx-gallery (>=0.17.1)", "sphinx-prompt (>=1.4.0)", "sphinx-remove-toctrees (>=1.0.0.post1)", "sphinxcontrib-sass (>=0.3.4)", "sphinxext-opengraph (>=0.9.1)", "towncrier (>=24.8.0)"] +examples = ["matplotlib (>=3.3.4)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)"] +install = ["joblib (>=1.2.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)", "threadpoolctl (>=3.1.0)"] +maintenance = ["conda-lock (==2.5.6)"] +tests = ["black (>=24.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.9)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pyarrow (>=12.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.5.1)", "scikit-image (>=0.17.2)"] + [[package]] name = "scipy" version = "1.14.1" @@ -6929,6 +7938,33 @@ nativelib = ["pyobjc-framework-Cocoa", "pywin32"] objc = ["pyobjc-framework-Cocoa"] win32 = ["pywin32"] +[[package]] +name = "sentence-transformers" +version = "3.3.1" +description = "State-of-the-Art Text Embeddings" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sentence_transformers-3.3.1-py3-none-any.whl", hash = "sha256:abffcc79dab37b7d18d21a26d5914223dd42239cfe18cb5e111c66c54b658ae7"}, + {file = "sentence_transformers-3.3.1.tar.gz", hash = "sha256:9635dbfb11c6b01d036b9cfcee29f7716ab64cf2407ad9f403a2e607da2ac48b"}, +] + +[package.dependencies] +huggingface-hub = ">=0.20.0" +Pillow = "*" +scikit-learn = "*" +scipy = "*" +torch = ">=1.11.0" +tqdm = "*" +transformers = ">=4.41.0,<5.0.0" + +[package.extras] +dev = ["accelerate (>=0.20.3)", "datasets", "peft", "pre-commit", "pytest", "pytest-cov"] +onnx = ["optimum[onnxruntime] (>=1.23.1)"] +onnx-gpu = ["optimum[onnxruntime-gpu] (>=1.23.1)"] +openvino = ["optimum-intel[openvino] (>=1.20.0)"] +train = ["accelerate (>=0.20.3)", "datasets"] + [[package]] name = "sentencepiece" version = "0.2.0" @@ -7849,6 +8885,23 @@ pure-eval = "*" [package.extras] tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] +[[package]] +name = "starlette" +version = "0.41.3" +description = "The little ASGI library that shines." +optional = false +python-versions = ">=3.8" +files = [ + {file = "starlette-0.41.3-py3-none-any.whl", hash = "sha256:44cedb2b7c77a9de33a8b74b2b90e9f50d11fcf25d8270ea525ad71a25374ff7"}, + {file = "starlette-0.41.3.tar.gz", hash = "sha256:0e4ab3d16522a255be6b28260b938eae2482f98ce5cc934cb08dce8dc3ba5835"}, +] + +[package.dependencies] +anyio = ">=3.4.0,<5" + +[package.extras] +full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.7)", "pyyaml"] + [[package]] name = "structlog" version = "24.4.0" @@ -7908,6 +8961,21 @@ files = [ {file = "tblib-3.0.0.tar.gz", hash = "sha256:93622790a0a29e04f0346458face1e144dc4d32f493714c6c3dff82a4adb77e6"}, ] +[[package]] +name = "tenacity" +version = "9.0.0" +description = "Retry code until it succeeds" +optional = false +python-versions = ">=3.8" +files = [ + {file = "tenacity-9.0.0-py3-none-any.whl", hash = "sha256:93de0c98785b27fcf659856aa9f54bfbd399e29969b0621bc7f762bd441b4539"}, + {file = "tenacity-9.0.0.tar.gz", hash = "sha256:807f37ca97d62aa361264d497b0e31e92b8027044942bfa756160d908320d73b"}, +] + +[package.extras] +doc = ["reno", "sphinx"] +test = ["pytest", "tornado (>=4.5)", "typeguard"] + [[package]] name = "terminado" version = "0.18.1" @@ -8015,6 +9083,17 @@ mxnet = ["mxnet (>=1.5.1,<1.6.0)"] tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"] torch = ["torch (>=1.6.0)"] +[[package]] +name = "threadpoolctl" +version = "3.5.0" +description = "threadpoolctl" +optional = false +python-versions = ">=3.8" +files = [ + {file = "threadpoolctl-3.5.0-py3-none-any.whl", hash = "sha256:56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467"}, + {file = "threadpoolctl-3.5.0.tar.gz", hash = "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107"}, +] + [[package]] name = "tifffile" version = "2024.12.12" @@ -8622,6 +9701,82 @@ h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "uvicorn" +version = "0.34.0" +description = "The lightning-fast ASGI server." +optional = false +python-versions = ">=3.9" +files = [ + {file = "uvicorn-0.34.0-py3-none-any.whl", hash = "sha256:023dc038422502fa28a09c7a30bf2b6991512da7dcdb8fd35fe57cfc154126f4"}, + {file = "uvicorn-0.34.0.tar.gz", hash = "sha256:404051050cd7e905de2c9a7e61790943440b3416f49cb409f965d9dcd0fa73e9"}, +] + +[package.dependencies] +click = ">=7.0" +colorama = {version = ">=0.4", optional = true, markers = "sys_platform == \"win32\" and extra == \"standard\""} +h11 = ">=0.8" +httptools = {version = ">=0.6.3", optional = true, markers = "extra == \"standard\""} +python-dotenv = {version = ">=0.13", optional = true, markers = "extra == \"standard\""} +pyyaml = {version = ">=5.1", optional = true, markers = "extra == \"standard\""} +uvloop = {version = ">=0.14.0,<0.15.0 || >0.15.0,<0.15.1 || >0.15.1", optional = true, markers = "(sys_platform != \"win32\" and sys_platform != \"cygwin\") and platform_python_implementation != \"PyPy\" and extra == \"standard\""} +watchfiles = {version = ">=0.13", optional = true, markers = "extra == \"standard\""} +websockets = {version = ">=10.4", optional = true, markers = "extra == \"standard\""} + +[package.extras] +standard = ["colorama (>=0.4)", "httptools (>=0.6.3)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"] + +[[package]] +name = "uvloop" +version = "0.21.0" +description = "Fast implementation of asyncio event loop on top of libuv" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ec7e6b09a6fdded42403182ab6b832b71f4edaf7f37a9a0e371a01db5f0cb45f"}, + {file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:196274f2adb9689a289ad7d65700d37df0c0930fd8e4e743fa4834e850d7719d"}, + {file = "uvloop-0.21.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f38b2e090258d051d68a5b14d1da7203a3c3677321cf32a95a6f4db4dd8b6f26"}, + {file = "uvloop-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87c43e0f13022b998eb9b973b5e97200c8b90823454d4bc06ab33829e09fb9bb"}, + {file = "uvloop-0.21.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:10d66943def5fcb6e7b37310eb6b5639fd2ccbc38df1177262b0640c3ca68c1f"}, + {file = "uvloop-0.21.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:67dd654b8ca23aed0a8e99010b4c34aca62f4b7fce88f39d452ed7622c94845c"}, + {file = "uvloop-0.21.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c0f3fa6200b3108919f8bdabb9a7f87f20e7097ea3c543754cabc7d717d95cf8"}, + {file = "uvloop-0.21.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0878c2640cf341b269b7e128b1a5fed890adc4455513ca710d77d5e93aa6d6a0"}, + {file = "uvloop-0.21.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9fb766bb57b7388745d8bcc53a359b116b8a04c83a2288069809d2b3466c37e"}, + {file = "uvloop-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a375441696e2eda1c43c44ccb66e04d61ceeffcd76e4929e527b7fa401b90fb"}, + {file = "uvloop-0.21.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:baa0e6291d91649c6ba4ed4b2f982f9fa165b5bbd50a9e203c416a2797bab3c6"}, + {file = "uvloop-0.21.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4509360fcc4c3bd2c70d87573ad472de40c13387f5fda8cb58350a1d7475e58d"}, + {file = "uvloop-0.21.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:359ec2c888397b9e592a889c4d72ba3d6befba8b2bb01743f72fffbde663b59c"}, + {file = "uvloop-0.21.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f7089d2dc73179ce5ac255bdf37c236a9f914b264825fdaacaded6990a7fb4c2"}, + {file = "uvloop-0.21.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:baa4dcdbd9ae0a372f2167a207cd98c9f9a1ea1188a8a526431eef2f8116cc8d"}, + {file = "uvloop-0.21.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86975dca1c773a2c9864f4c52c5a55631038e387b47eaf56210f873887b6c8dc"}, + {file = "uvloop-0.21.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:461d9ae6660fbbafedd07559c6a2e57cd553b34b0065b6550685f6653a98c1cb"}, + {file = "uvloop-0.21.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:183aef7c8730e54c9a3ee3227464daed66e37ba13040bb3f350bc2ddc040f22f"}, + {file = "uvloop-0.21.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:bfd55dfcc2a512316e65f16e503e9e450cab148ef11df4e4e679b5e8253a5281"}, + {file = "uvloop-0.21.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:787ae31ad8a2856fc4e7c095341cccc7209bd657d0e71ad0dc2ea83c4a6fa8af"}, + {file = "uvloop-0.21.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ee4d4ef48036ff6e5cfffb09dd192c7a5027153948d85b8da7ff705065bacc6"}, + {file = "uvloop-0.21.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816"}, + {file = "uvloop-0.21.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd53ecc9a0f3d87ab847503c2e1552b690362e005ab54e8a48ba97da3924c0dc"}, + {file = "uvloop-0.21.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a5c39f217ab3c663dc699c04cbd50c13813e31d917642d459fdcec07555cc553"}, + {file = "uvloop-0.21.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:17df489689befc72c39a08359efac29bbee8eee5209650d4b9f34df73d22e414"}, + {file = "uvloop-0.21.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bc09f0ff191e61c2d592a752423c767b4ebb2986daa9ed62908e2b1b9a9ae206"}, + {file = "uvloop-0.21.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0ce1b49560b1d2d8a2977e3ba4afb2414fb46b86a1b64056bc4ab929efdafbe"}, + {file = "uvloop-0.21.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e678ad6fe52af2c58d2ae3c73dc85524ba8abe637f134bf3564ed07f555c5e79"}, + {file = "uvloop-0.21.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:460def4412e473896ef179a1671b40c039c7012184b627898eea5072ef6f017a"}, + {file = "uvloop-0.21.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:10da8046cc4a8f12c91a1c39d1dd1585c41162a15caaef165c2174db9ef18bdc"}, + {file = "uvloop-0.21.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c097078b8031190c934ed0ebfee8cc5f9ba9642e6eb88322b9958b649750f72b"}, + {file = "uvloop-0.21.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:46923b0b5ee7fc0020bef24afe7836cb068f5050ca04caf6b487c513dc1a20b2"}, + {file = "uvloop-0.21.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:53e420a3afe22cdcf2a0f4846e377d16e718bc70103d7088a4f7623567ba5fb0"}, + {file = "uvloop-0.21.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88cb67cdbc0e483da00af0b2c3cdad4b7c61ceb1ee0f33fe00e09c81e3a6cb75"}, + {file = "uvloop-0.21.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:221f4f2a1f46032b403bf3be628011caf75428ee3cc204a22addf96f586b19fd"}, + {file = "uvloop-0.21.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2d1f581393673ce119355d56da84fe1dd9d2bb8b3d13ce792524e1607139feff"}, + {file = "uvloop-0.21.0.tar.gz", hash = "sha256:3bf12b0fda68447806a7ad847bfa591613177275d35b6724b1ee573faa3704e3"}, +] + +[package.extras] +dev = ["Cython (>=3.0,<4.0)", "setuptools (>=60)"] +docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] +test = ["aiohttp (>=3.10.5)", "flake8 (>=5.0,<6.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=23.0.0,<23.1.0)", "pycodestyle (>=2.9.0,<2.10.0)"] + [[package]] name = "wasabi" version = "1.1.3" @@ -8636,6 +9791,89 @@ files = [ [package.dependencies] colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\" and python_version >= \"3.7\""} +[[package]] +name = "watchfiles" +version = "1.0.3" +description = "Simple, modern and high performance file watching and code reload in python." +optional = false +python-versions = ">=3.9" +files = [ + {file = "watchfiles-1.0.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:1da46bb1eefb5a37a8fb6fd52ad5d14822d67c498d99bda8754222396164ae42"}, + {file = "watchfiles-1.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2b961b86cd3973f5822826017cad7f5a75795168cb645c3a6b30c349094e02e3"}, + {file = "watchfiles-1.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34e87c7b3464d02af87f1059fedda5484e43b153ef519e4085fe1a03dd94801e"}, + {file = "watchfiles-1.0.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d9dd2b89a16cf7ab9c1170b5863e68de6bf83db51544875b25a5f05a7269e678"}, + {file = "watchfiles-1.0.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b4691234d31686dca133c920f94e478b548a8e7c750f28dbbc2e4333e0d3da9"}, + {file = "watchfiles-1.0.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:90b0fe1fcea9bd6e3084b44875e179b4adcc4057a3b81402658d0eb58c98edf8"}, + {file = "watchfiles-1.0.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0b90651b4cf9e158d01faa0833b073e2e37719264bcee3eac49fc3c74e7d304b"}, + {file = "watchfiles-1.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c2e9fe695ff151b42ab06501820f40d01310fbd58ba24da8923ace79cf6d702d"}, + {file = "watchfiles-1.0.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62691f1c0894b001c7cde1195c03b7801aaa794a837bd6eef24da87d1542838d"}, + {file = "watchfiles-1.0.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:275c1b0e942d335fccb6014d79267d1b9fa45b5ac0639c297f1e856f2f532552"}, + {file = "watchfiles-1.0.3-cp310-cp310-win32.whl", hash = "sha256:06ce08549e49ba69ccc36fc5659a3d0ff4e3a07d542b895b8a9013fcab46c2dc"}, + {file = "watchfiles-1.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:f280b02827adc9d87f764972fbeb701cf5611f80b619c20568e1982a277d6146"}, + {file = "watchfiles-1.0.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:ffe709b1d0bc2e9921257569675674cafb3a5f8af689ab9f3f2b3f88775b960f"}, + {file = "watchfiles-1.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:418c5ce332f74939ff60691e5293e27c206c8164ce2b8ce0d9abf013003fb7fe"}, + {file = "watchfiles-1.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f492d2907263d6d0d52f897a68647195bc093dafed14508a8d6817973586b6b"}, + {file = "watchfiles-1.0.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:48c9f3bc90c556a854f4cab6a79c16974099ccfa3e3e150673d82d47a4bc92c9"}, + {file = "watchfiles-1.0.3-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:75d3bcfa90454dba8df12adc86b13b6d85fda97d90e708efc036c2760cc6ba44"}, + {file = "watchfiles-1.0.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5691340f259b8f76b45fb31b98e594d46c36d1dc8285efa7975f7f50230c9093"}, + {file = "watchfiles-1.0.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1e263cc718545b7f897baeac1f00299ab6fabe3e18caaacacb0edf6d5f35513c"}, + {file = "watchfiles-1.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c6cf7709ed3e55704cc06f6e835bf43c03bc8e3cb8ff946bf69a2e0a78d9d77"}, + {file = "watchfiles-1.0.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:703aa5e50e465be901e0e0f9d5739add15e696d8c26c53bc6fc00eb65d7b9469"}, + {file = "watchfiles-1.0.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:bfcae6aecd9e0cb425f5145afee871465b98b75862e038d42fe91fd753ddd780"}, + {file = "watchfiles-1.0.3-cp311-cp311-win32.whl", hash = "sha256:6a76494d2c5311584f22416c5a87c1e2cb954ff9b5f0988027bc4ef2a8a67181"}, + {file = "watchfiles-1.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:cf745cbfad6389c0e331786e5fe9ae3f06e9d9c2ce2432378e1267954793975c"}, + {file = "watchfiles-1.0.3-cp311-cp311-win_arm64.whl", hash = "sha256:2dcc3f60c445f8ce14156854a072ceb36b83807ed803d37fdea2a50e898635d6"}, + {file = "watchfiles-1.0.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:93436ed550e429da007fbafb723e0769f25bae178fbb287a94cb4ccdf42d3af3"}, + {file = "watchfiles-1.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c18f3502ad0737813c7dad70e3e1cc966cc147fbaeef47a09463bbffe70b0a00"}, + {file = "watchfiles-1.0.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a5bc3ca468bb58a2ef50441f953e1f77b9a61bd1b8c347c8223403dc9b4ac9a"}, + {file = "watchfiles-1.0.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0d1ec043f02ca04bf21b1b32cab155ce90c651aaf5540db8eb8ad7f7e645cba8"}, + {file = "watchfiles-1.0.3-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f58d3bfafecf3d81c15d99fc0ecf4319e80ac712c77cf0ce2661c8cf8bf84066"}, + {file = "watchfiles-1.0.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1df924ba82ae9e77340101c28d56cbaff2c991bd6fe8444a545d24075abb0a87"}, + {file = "watchfiles-1.0.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:632a52dcaee44792d0965c17bdfe5dc0edad5b86d6a29e53d6ad4bf92dc0ff49"}, + {file = "watchfiles-1.0.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bf4b459d94a0387617a1b499f314aa04d8a64b7a0747d15d425b8c8b151da0"}, + {file = "watchfiles-1.0.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ca94c85911601b097d53caeeec30201736ad69a93f30d15672b967558df02885"}, + {file = "watchfiles-1.0.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:65ab1fb635476f6170b07e8e21db0424de94877e4b76b7feabfe11f9a5fc12b5"}, + {file = "watchfiles-1.0.3-cp312-cp312-win32.whl", hash = "sha256:49bc1bc26abf4f32e132652f4b3bfeec77d8f8f62f57652703ef127e85a3e38d"}, + {file = "watchfiles-1.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:48681c86f2cb08348631fed788a116c89c787fdf1e6381c5febafd782f6c3b44"}, + {file = "watchfiles-1.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:9e080cf917b35b20c889225a13f290f2716748362f6071b859b60b8847a6aa43"}, + {file = "watchfiles-1.0.3-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e153a690b7255c5ced17895394b4f109d5dcc2a4f35cb809374da50f0e5c456a"}, + {file = "watchfiles-1.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ac1be85fe43b4bf9a251978ce5c3bb30e1ada9784290441f5423a28633a958a7"}, + {file = "watchfiles-1.0.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a2ec98e31e1844eac860e70d9247db9d75440fc8f5f679c37d01914568d18721"}, + {file = "watchfiles-1.0.3-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0179252846be03fa97d4d5f8233d1c620ef004855f0717712ae1c558f1974a16"}, + {file = "watchfiles-1.0.3-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:995c374e86fa82126c03c5b4630c4e312327ecfe27761accb25b5e1d7ab50ec8"}, + {file = "watchfiles-1.0.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:29b9cb35b7f290db1c31fb2fdf8fc6d3730cfa4bca4b49761083307f441cac5a"}, + {file = "watchfiles-1.0.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f8dc09ae69af50bead60783180f656ad96bd33ffbf6e7a6fce900f6d53b08f1"}, + {file = "watchfiles-1.0.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:489b80812f52a8d8c7b0d10f0d956db0efed25df2821c7a934f6143f76938bd6"}, + {file = "watchfiles-1.0.3-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:228e2247de583475d4cebf6b9af5dc9918abb99d1ef5ee737155bb39fb33f3c0"}, + {file = "watchfiles-1.0.3-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:1550be1a5cb3be08a3fb84636eaafa9b7119b70c71b0bed48726fd1d5aa9b868"}, + {file = "watchfiles-1.0.3-cp313-cp313-win32.whl", hash = "sha256:16db2d7e12f94818cbf16d4c8938e4d8aaecee23826344addfaaa671a1527b07"}, + {file = "watchfiles-1.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:160eff7d1267d7b025e983ca8460e8cc67b328284967cbe29c05f3c3163711a3"}, + {file = "watchfiles-1.0.3-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:c05b021f7b5aa333124f2a64d56e4cb9963b6efdf44e8d819152237bbd93ba15"}, + {file = "watchfiles-1.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:310505ad305e30cb6c5f55945858cdbe0eb297fc57378f29bacceb534ac34199"}, + {file = "watchfiles-1.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ddff3f8b9fa24a60527c137c852d0d9a7da2a02cf2151650029fdc97c852c974"}, + {file = "watchfiles-1.0.3-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:46e86ed457c3486080a72bc837300dd200e18d08183f12b6ca63475ab64ed651"}, + {file = "watchfiles-1.0.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f79fe7993e230a12172ce7d7c7db061f046f672f2b946431c81aff8f60b2758b"}, + {file = "watchfiles-1.0.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea2b51c5f38bad812da2ec0cd7eec09d25f521a8b6b6843cbccedd9a1d8a5c15"}, + {file = "watchfiles-1.0.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0fe4e740ea94978b2b2ab308cbf9270a246bcbb44401f77cc8740348cbaeac3d"}, + {file = "watchfiles-1.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9af037d3df7188ae21dc1c7624501f2f90d81be6550904e07869d8d0e6766655"}, + {file = "watchfiles-1.0.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:52bb50a4c4ca2a689fdba84ba8ecc6a4e6210f03b6af93181bb61c4ec3abaf86"}, + {file = "watchfiles-1.0.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c14a07bdb475eb696f85c715dbd0f037918ccbb5248290448488a0b4ef201aad"}, + {file = "watchfiles-1.0.3-cp39-cp39-win32.whl", hash = "sha256:be37f9b1f8934cd9e7eccfcb5612af9fb728fecbe16248b082b709a9d1b348bf"}, + {file = "watchfiles-1.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:ef9ec8068cf23458dbf36a08e0c16f0a2df04b42a8827619646637be1769300a"}, + {file = "watchfiles-1.0.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:84fac88278f42d61c519a6c75fb5296fd56710b05bbdcc74bdf85db409a03780"}, + {file = "watchfiles-1.0.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:c68be72b1666d93b266714f2d4092d78dc53bd11cf91ed5a3c16527587a52e29"}, + {file = "watchfiles-1.0.3-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:889a37e2acf43c377b5124166bece139b4c731b61492ab22e64d371cce0e6e80"}, + {file = "watchfiles-1.0.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ca05cacf2e5c4a97d02a2878a24020daca21dbb8823b023b978210a75c79098"}, + {file = "watchfiles-1.0.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:8af4b582d5fc1b8465d1d2483e5e7b880cc1a4e99f6ff65c23d64d070867ac58"}, + {file = "watchfiles-1.0.3-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:127de3883bdb29dbd3b21f63126bb8fa6e773b74eaef46521025a9ce390e1073"}, + {file = "watchfiles-1.0.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:713f67132346bdcb4c12df185c30cf04bdf4bf6ea3acbc3ace0912cab6b7cb8c"}, + {file = "watchfiles-1.0.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abd85de513eb83f5ec153a802348e7a5baa4588b818043848247e3e8986094e8"}, + {file = "watchfiles-1.0.3.tar.gz", hash = "sha256:f3ff7da165c99a5412fe5dd2304dd2dbaaaa5da718aad942dcb3a178eaa70c56"}, +] + +[package.dependencies] +anyio = ">=3.0.0" + [[package]] name = "wcwidth" version = "0.2.13" @@ -8707,6 +9945,84 @@ docs = ["Sphinx (>=6.0)", "myst-parser (>=2.0.0)", "sphinx-rtd-theme (>=1.1.0)"] optional = ["python-socks", "wsaccel"] test = ["websockets"] +[[package]] +name = "websockets" +version = "14.1" +description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" +optional = false +python-versions = ">=3.9" +files = [ + {file = "websockets-14.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a0adf84bc2e7c86e8a202537b4fd50e6f7f0e4a6b6bf64d7ccb96c4cd3330b29"}, + {file = "websockets-14.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90b5d9dfbb6d07a84ed3e696012610b6da074d97453bd01e0e30744b472c8179"}, + {file = "websockets-14.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2177ee3901075167f01c5e335a6685e71b162a54a89a56001f1c3e9e3d2ad250"}, + {file = "websockets-14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f14a96a0034a27f9d47fd9788913924c89612225878f8078bb9d55f859272b0"}, + {file = "websockets-14.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f874ba705deea77bcf64a9da42c1f5fc2466d8f14daf410bc7d4ceae0a9fcb0"}, + {file = "websockets-14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9607b9a442392e690a57909c362811184ea429585a71061cd5d3c2b98065c199"}, + {file = "websockets-14.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:bea45f19b7ca000380fbd4e02552be86343080120d074b87f25593ce1700ad58"}, + {file = "websockets-14.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:219c8187b3ceeadbf2afcf0f25a4918d02da7b944d703b97d12fb01510869078"}, + {file = "websockets-14.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ad2ab2547761d79926effe63de21479dfaf29834c50f98c4bf5b5480b5838434"}, + {file = "websockets-14.1-cp310-cp310-win32.whl", hash = "sha256:1288369a6a84e81b90da5dbed48610cd7e5d60af62df9851ed1d1d23a9069f10"}, + {file = "websockets-14.1-cp310-cp310-win_amd64.whl", hash = "sha256:e0744623852f1497d825a49a99bfbec9bea4f3f946df6eb9d8a2f0c37a2fec2e"}, + {file = "websockets-14.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:449d77d636f8d9c17952628cc7e3b8faf6e92a17ec581ec0c0256300717e1512"}, + {file = "websockets-14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a35f704be14768cea9790d921c2c1cc4fc52700410b1c10948511039be824aac"}, + {file = "websockets-14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b1f3628a0510bd58968c0f60447e7a692933589b791a6b572fcef374053ca280"}, + {file = "websockets-14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c3deac3748ec73ef24fc7be0b68220d14d47d6647d2f85b2771cb35ea847aa1"}, + {file = "websockets-14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7048eb4415d46368ef29d32133134c513f507fff7d953c18c91104738a68c3b3"}, + {file = "websockets-14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6cf0ad281c979306a6a34242b371e90e891bce504509fb6bb5246bbbf31e7b6"}, + {file = "websockets-14.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cc1fc87428c1d18b643479caa7b15db7d544652e5bf610513d4a3478dbe823d0"}, + {file = "websockets-14.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f95ba34d71e2fa0c5d225bde3b3bdb152e957150100e75c86bc7f3964c450d89"}, + {file = "websockets-14.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9481a6de29105d73cf4515f2bef8eb71e17ac184c19d0b9918a3701c6c9c4f23"}, + {file = "websockets-14.1-cp311-cp311-win32.whl", hash = "sha256:368a05465f49c5949e27afd6fbe0a77ce53082185bbb2ac096a3a8afaf4de52e"}, + {file = "websockets-14.1-cp311-cp311-win_amd64.whl", hash = "sha256:6d24fc337fc055c9e83414c94e1ee0dee902a486d19d2a7f0929e49d7d604b09"}, + {file = "websockets-14.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ed907449fe5e021933e46a3e65d651f641975a768d0649fee59f10c2985529ed"}, + {file = "websockets-14.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:87e31011b5c14a33b29f17eb48932e63e1dcd3fa31d72209848652310d3d1f0d"}, + {file = "websockets-14.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bc6ccf7d54c02ae47a48ddf9414c54d48af9c01076a2e1023e3b486b6e72c707"}, + {file = "websockets-14.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9777564c0a72a1d457f0848977a1cbe15cfa75fa2f67ce267441e465717dcf1a"}, + {file = "websockets-14.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a655bde548ca98f55b43711b0ceefd2a88a71af6350b0c168aa77562104f3f45"}, + {file = "websockets-14.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3dfff83ca578cada2d19e665e9c8368e1598d4e787422a460ec70e531dbdd58"}, + {file = "websockets-14.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6a6c9bcf7cdc0fd41cc7b7944447982e8acfd9f0d560ea6d6845428ed0562058"}, + {file = "websockets-14.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4b6caec8576e760f2c7dd878ba817653144d5f369200b6ddf9771d64385b84d4"}, + {file = "websockets-14.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:eb6d38971c800ff02e4a6afd791bbe3b923a9a57ca9aeab7314c21c84bf9ff05"}, + {file = "websockets-14.1-cp312-cp312-win32.whl", hash = "sha256:1d045cbe1358d76b24d5e20e7b1878efe578d9897a25c24e6006eef788c0fdf0"}, + {file = "websockets-14.1-cp312-cp312-win_amd64.whl", hash = "sha256:90f4c7a069c733d95c308380aae314f2cb45bd8a904fb03eb36d1a4983a4993f"}, + {file = "websockets-14.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:3630b670d5057cd9e08b9c4dab6493670e8e762a24c2c94ef312783870736ab9"}, + {file = "websockets-14.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:36ebd71db3b89e1f7b1a5deaa341a654852c3518ea7a8ddfdf69cc66acc2db1b"}, + {file = "websockets-14.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5b918d288958dc3fa1c5a0b9aa3256cb2b2b84c54407f4813c45d52267600cd3"}, + {file = "websockets-14.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00fe5da3f037041da1ee0cf8e308374e236883f9842c7c465aa65098b1c9af59"}, + {file = "websockets-14.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8149a0f5a72ca36720981418eeffeb5c2729ea55fa179091c81a0910a114a5d2"}, + {file = "websockets-14.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77569d19a13015e840b81550922056acabc25e3f52782625bc6843cfa034e1da"}, + {file = "websockets-14.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cf5201a04550136ef870aa60ad3d29d2a59e452a7f96b94193bee6d73b8ad9a9"}, + {file = "websockets-14.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:88cf9163ef674b5be5736a584c999e98daf3aabac6e536e43286eb74c126b9c7"}, + {file = "websockets-14.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:836bef7ae338a072e9d1863502026f01b14027250a4545672673057997d5c05a"}, + {file = "websockets-14.1-cp313-cp313-win32.whl", hash = "sha256:0d4290d559d68288da9f444089fd82490c8d2744309113fc26e2da6e48b65da6"}, + {file = "websockets-14.1-cp313-cp313-win_amd64.whl", hash = "sha256:8621a07991add373c3c5c2cf89e1d277e49dc82ed72c75e3afc74bd0acc446f0"}, + {file = "websockets-14.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:01bb2d4f0a6d04538d3c5dfd27c0643269656c28045a53439cbf1c004f90897a"}, + {file = "websockets-14.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:414ffe86f4d6f434a8c3b7913655a1a5383b617f9bf38720e7c0799fac3ab1c6"}, + {file = "websockets-14.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8fda642151d5affdee8a430bd85496f2e2517be3a2b9d2484d633d5712b15c56"}, + {file = "websockets-14.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd7c11968bc3860d5c78577f0dbc535257ccec41750675d58d8dc66aa47fe52c"}, + {file = "websockets-14.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a032855dc7db987dff813583d04f4950d14326665d7e714d584560b140ae6b8b"}, + {file = "websockets-14.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7e7ea2f782408c32d86b87a0d2c1fd8871b0399dd762364c731d86c86069a78"}, + {file = "websockets-14.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:39450e6215f7d9f6f7bc2a6da21d79374729f5d052333da4d5825af8a97e6735"}, + {file = "websockets-14.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ceada5be22fa5a5a4cdeec74e761c2ee7db287208f54c718f2df4b7e200b8d4a"}, + {file = "websockets-14.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:3fc753451d471cff90b8f467a1fc0ae64031cf2d81b7b34e1811b7e2691bc4bc"}, + {file = "websockets-14.1-cp39-cp39-win32.whl", hash = "sha256:14839f54786987ccd9d03ed7f334baec0f02272e7ec4f6e9d427ff584aeea8b4"}, + {file = "websockets-14.1-cp39-cp39-win_amd64.whl", hash = "sha256:d9fd19ecc3a4d5ae82ddbfb30962cf6d874ff943e56e0c81f5169be2fda62979"}, + {file = "websockets-14.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:e5dc25a9dbd1a7f61eca4b7cb04e74ae4b963d658f9e4f9aad9cd00b688692c8"}, + {file = "websockets-14.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:04a97aca96ca2acedf0d1f332c861c5a4486fdcba7bcef35873820f940c4231e"}, + {file = "websockets-14.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df174ece723b228d3e8734a6f2a6febbd413ddec39b3dc592f5a4aa0aff28098"}, + {file = "websockets-14.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:034feb9f4286476f273b9a245fb15f02c34d9586a5bc936aff108c3ba1b21beb"}, + {file = "websockets-14.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:660c308dabd2b380807ab64b62985eaccf923a78ebc572bd485375b9ca2b7dc7"}, + {file = "websockets-14.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5a42d3ecbb2db5080fc578314439b1d79eef71d323dc661aa616fb492436af5d"}, + {file = "websockets-14.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:ddaa4a390af911da6f680be8be4ff5aaf31c4c834c1a9147bc21cbcbca2d4370"}, + {file = "websockets-14.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a4c805c6034206143fbabd2d259ec5e757f8b29d0a2f0bf3d2fe5d1f60147a4a"}, + {file = "websockets-14.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:205f672a6c2c671a86d33f6d47c9b35781a998728d2c7c2a3e1cf3333fcb62b7"}, + {file = "websockets-14.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef440054124728cc49b01c33469de06755e5a7a4e83ef61934ad95fc327fbb0"}, + {file = "websockets-14.1-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7591d6f440af7f73c4bd9404f3772bfee064e639d2b6cc8c94076e71b2471c1"}, + {file = "websockets-14.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:25225cc79cfebc95ba1d24cd3ab86aaa35bcd315d12fa4358939bd55e9bd74a5"}, + {file = "websockets-14.1-py3-none-any.whl", hash = "sha256:4d4fc827a20abe6d544a119896f6b78ee13fe81cbfef416f3f2ddf09a03f0e2e"}, + {file = "websockets-14.1.tar.gz", hash = "sha256:398b10c77d471c0aab20a845e7a60076b6390bfdaac7a6d2edb0d2c59d75e8d8"}, +] + [[package]] name = "wheel" version = "0.45.0" @@ -8732,6 +10048,80 @@ files = [ {file = "widgetsnbextension-4.0.13.tar.gz", hash = "sha256:ffcb67bc9febd10234a362795f643927f4e0c05d9342c727b65d2384f8feacb6"}, ] +[[package]] +name = "wrapt" +version = "1.17.0" +description = "Module for decorators, wrappers and monkey patching." +optional = false +python-versions = ">=3.8" +files = [ + {file = "wrapt-1.17.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2a0c23b8319848426f305f9cb0c98a6e32ee68a36264f45948ccf8e7d2b941f8"}, + {file = "wrapt-1.17.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1ca5f060e205f72bec57faae5bd817a1560fcfc4af03f414b08fa29106b7e2d"}, + {file = "wrapt-1.17.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e185ec6060e301a7e5f8461c86fb3640a7beb1a0f0208ffde7a65ec4074931df"}, + {file = "wrapt-1.17.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb90765dd91aed05b53cd7a87bd7f5c188fcd95960914bae0d32c5e7f899719d"}, + {file = "wrapt-1.17.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:879591c2b5ab0a7184258274c42a126b74a2c3d5a329df16d69f9cee07bba6ea"}, + {file = "wrapt-1.17.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:fce6fee67c318fdfb7f285c29a82d84782ae2579c0e1b385b7f36c6e8074fffb"}, + {file = "wrapt-1.17.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0698d3a86f68abc894d537887b9bbf84d29bcfbc759e23f4644be27acf6da301"}, + {file = "wrapt-1.17.0-cp310-cp310-win32.whl", hash = "sha256:69d093792dc34a9c4c8a70e4973a3361c7a7578e9cd86961b2bbf38ca71e4e22"}, + {file = "wrapt-1.17.0-cp310-cp310-win_amd64.whl", hash = "sha256:f28b29dc158ca5d6ac396c8e0a2ef45c4e97bb7e65522bfc04c989e6fe814575"}, + {file = "wrapt-1.17.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:74bf625b1b4caaa7bad51d9003f8b07a468a704e0644a700e936c357c17dd45a"}, + {file = "wrapt-1.17.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f2a28eb35cf99d5f5bd12f5dd44a0f41d206db226535b37b0c60e9da162c3ed"}, + {file = "wrapt-1.17.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:81b1289e99cf4bad07c23393ab447e5e96db0ab50974a280f7954b071d41b489"}, + {file = "wrapt-1.17.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f2939cd4a2a52ca32bc0b359015718472d7f6de870760342e7ba295be9ebaf9"}, + {file = "wrapt-1.17.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6a9653131bda68a1f029c52157fd81e11f07d485df55410401f745007bd6d339"}, + {file = "wrapt-1.17.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4e4b4385363de9052dac1a67bfb535c376f3d19c238b5f36bddc95efae15e12d"}, + {file = "wrapt-1.17.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bdf62d25234290db1837875d4dceb2151e4ea7f9fff2ed41c0fde23ed542eb5b"}, + {file = "wrapt-1.17.0-cp311-cp311-win32.whl", hash = "sha256:5d8fd17635b262448ab8f99230fe4dac991af1dabdbb92f7a70a6afac8a7e346"}, + {file = "wrapt-1.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:92a3d214d5e53cb1db8b015f30d544bc9d3f7179a05feb8f16df713cecc2620a"}, + {file = "wrapt-1.17.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:89fc28495896097622c3fc238915c79365dd0ede02f9a82ce436b13bd0ab7569"}, + {file = "wrapt-1.17.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:875d240fdbdbe9e11f9831901fb8719da0bd4e6131f83aa9f69b96d18fae7504"}, + {file = "wrapt-1.17.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5ed16d95fd142e9c72b6c10b06514ad30e846a0d0917ab406186541fe68b451"}, + {file = "wrapt-1.17.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18b956061b8db634120b58f668592a772e87e2e78bc1f6a906cfcaa0cc7991c1"}, + {file = "wrapt-1.17.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:daba396199399ccabafbfc509037ac635a6bc18510ad1add8fd16d4739cdd106"}, + {file = "wrapt-1.17.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4d63f4d446e10ad19ed01188d6c1e1bb134cde8c18b0aa2acfd973d41fcc5ada"}, + {file = "wrapt-1.17.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8a5e7cc39a45fc430af1aefc4d77ee6bad72c5bcdb1322cfde852c15192b8bd4"}, + {file = "wrapt-1.17.0-cp312-cp312-win32.whl", hash = "sha256:0a0a1a1ec28b641f2a3a2c35cbe86c00051c04fffcfcc577ffcdd707df3f8635"}, + {file = "wrapt-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:3c34f6896a01b84bab196f7119770fd8466c8ae3dfa73c59c0bb281e7b588ce7"}, + {file = "wrapt-1.17.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:714c12485aa52efbc0fc0ade1e9ab3a70343db82627f90f2ecbc898fdf0bb181"}, + {file = "wrapt-1.17.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da427d311782324a376cacb47c1a4adc43f99fd9d996ffc1b3e8529c4074d393"}, + {file = "wrapt-1.17.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba1739fb38441a27a676f4de4123d3e858e494fac05868b7a281c0a383c098f4"}, + {file = "wrapt-1.17.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e711fc1acc7468463bc084d1b68561e40d1eaa135d8c509a65dd534403d83d7b"}, + {file = "wrapt-1.17.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:140ea00c87fafc42739bd74a94a5a9003f8e72c27c47cd4f61d8e05e6dec8721"}, + {file = "wrapt-1.17.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:73a96fd11d2b2e77d623a7f26e004cc31f131a365add1ce1ce9a19e55a1eef90"}, + {file = "wrapt-1.17.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0b48554952f0f387984da81ccfa73b62e52817a4386d070c75e4db7d43a28c4a"}, + {file = "wrapt-1.17.0-cp313-cp313-win32.whl", hash = "sha256:498fec8da10e3e62edd1e7368f4b24aa362ac0ad931e678332d1b209aec93045"}, + {file = "wrapt-1.17.0-cp313-cp313-win_amd64.whl", hash = "sha256:fd136bb85f4568fffca995bd3c8d52080b1e5b225dbf1c2b17b66b4c5fa02838"}, + {file = "wrapt-1.17.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:17fcf043d0b4724858f25b8826c36e08f9fb2e475410bece0ec44a22d533da9b"}, + {file = "wrapt-1.17.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4a557d97f12813dc5e18dad9fa765ae44ddd56a672bb5de4825527c847d6379"}, + {file = "wrapt-1.17.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0229b247b0fc7dee0d36176cbb79dbaf2a9eb7ecc50ec3121f40ef443155fb1d"}, + {file = "wrapt-1.17.0-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8425cfce27b8b20c9b89d77fb50e368d8306a90bf2b6eef2cdf5cd5083adf83f"}, + {file = "wrapt-1.17.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9c900108df470060174108012de06d45f514aa4ec21a191e7ab42988ff42a86c"}, + {file = "wrapt-1.17.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:4e547b447073fc0dbfcbff15154c1be8823d10dab4ad401bdb1575e3fdedff1b"}, + {file = "wrapt-1.17.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:914f66f3b6fc7b915d46c1cc424bc2441841083de01b90f9e81109c9759e43ab"}, + {file = "wrapt-1.17.0-cp313-cp313t-win32.whl", hash = "sha256:a4192b45dff127c7d69b3bdfb4d3e47b64179a0b9900b6351859f3001397dabf"}, + {file = "wrapt-1.17.0-cp313-cp313t-win_amd64.whl", hash = "sha256:4f643df3d4419ea3f856c5c3f40fec1d65ea2e89ec812c83f7767c8730f9827a"}, + {file = "wrapt-1.17.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:69c40d4655e078ede067a7095544bcec5a963566e17503e75a3a3e0fe2803b13"}, + {file = "wrapt-1.17.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f495b6754358979379f84534f8dd7a43ff8cff2558dcdea4a148a6e713a758f"}, + {file = "wrapt-1.17.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:baa7ef4e0886a6f482e00d1d5bcd37c201b383f1d314643dfb0367169f94f04c"}, + {file = "wrapt-1.17.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8fc931382e56627ec4acb01e09ce66e5c03c384ca52606111cee50d931a342d"}, + {file = "wrapt-1.17.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:8f8909cdb9f1b237786c09a810e24ee5e15ef17019f7cecb207ce205b9b5fcce"}, + {file = "wrapt-1.17.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ad47b095f0bdc5585bced35bd088cbfe4177236c7df9984b3cc46b391cc60627"}, + {file = "wrapt-1.17.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:948a9bd0fb2c5120457b07e59c8d7210cbc8703243225dbd78f4dfc13c8d2d1f"}, + {file = "wrapt-1.17.0-cp38-cp38-win32.whl", hash = "sha256:5ae271862b2142f4bc687bdbfcc942e2473a89999a54231aa1c2c676e28f29ea"}, + {file = "wrapt-1.17.0-cp38-cp38-win_amd64.whl", hash = "sha256:f335579a1b485c834849e9075191c9898e0731af45705c2ebf70e0cd5d58beed"}, + {file = "wrapt-1.17.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d751300b94e35b6016d4b1e7d0e7bbc3b5e1751e2405ef908316c2a9024008a1"}, + {file = "wrapt-1.17.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7264cbb4a18dc4acfd73b63e4bcfec9c9802614572025bdd44d0721983fc1d9c"}, + {file = "wrapt-1.17.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:33539c6f5b96cf0b1105a0ff4cf5db9332e773bb521cc804a90e58dc49b10578"}, + {file = "wrapt-1.17.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c30970bdee1cad6a8da2044febd824ef6dc4cc0b19e39af3085c763fdec7de33"}, + {file = "wrapt-1.17.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:bc7f729a72b16ee21795a943f85c6244971724819819a41ddbaeb691b2dd85ad"}, + {file = "wrapt-1.17.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6ff02a91c4fc9b6a94e1c9c20f62ea06a7e375f42fe57587f004d1078ac86ca9"}, + {file = "wrapt-1.17.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2dfb7cff84e72e7bf975b06b4989477873dcf160b2fd89959c629535df53d4e0"}, + {file = "wrapt-1.17.0-cp39-cp39-win32.whl", hash = "sha256:2399408ac33ffd5b200480ee858baa58d77dd30e0dd0cab6a8a9547135f30a88"}, + {file = "wrapt-1.17.0-cp39-cp39-win_amd64.whl", hash = "sha256:4f763a29ee6a20c529496a20a7bcb16a73de27f5da6a843249c7047daf135977"}, + {file = "wrapt-1.17.0-py3-none-any.whl", hash = "sha256:d2c63b93548eda58abf5188e505ffed0229bf675f7c3090f8e36ad55b8cbc371"}, + {file = "wrapt-1.17.0.tar.gz", hash = "sha256:16187aa2317c731170a88ef35e8937ae0f533c402872c1ee5e6d079fcf320801"}, +] + [[package]] name = "xlsxwriter" version = "3.2.0" @@ -9019,4 +10409,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.11,<3.13" -content-hash = "267d5b89cbb904d2eef66000c38aa60c7357ee191ddee1c8b417de7960ff261c" +content-hash = "444815d4d4fd2ec1f07a982de2474f76e35250395b0b634d78695a52b3056d6c" diff --git a/pyproject.toml b/pyproject.toml index 679bf8b..2e8086f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,8 @@ markitdown = "^0.0.1a3" docling = "^2.14.0" python-levenshtein = "^0.26.1" sphinx-math-dollar = "^1.2.1" +chromadb = "^0.6.1" +sentence-transformers = "^3.3.1" [build-system] diff --git a/tamingllms/_build/.doctrees/environment.pickle b/tamingllms/_build/.doctrees/environment.pickle index e38c5d9..1ec60c8 100644 Binary files a/tamingllms/_build/.doctrees/environment.pickle and b/tamingllms/_build/.doctrees/environment.pickle differ diff --git a/tamingllms/_build/.doctrees/markdown/intro.doctree b/tamingllms/_build/.doctrees/markdown/intro.doctree index 183fb21..fb094c2 100644 Binary files a/tamingllms/_build/.doctrees/markdown/intro.doctree and b/tamingllms/_build/.doctrees/markdown/intro.doctree differ diff --git a/tamingllms/_build/.doctrees/markdown/preface.doctree b/tamingllms/_build/.doctrees/markdown/preface.doctree index b874834..047aad8 100644 Binary files a/tamingllms/_build/.doctrees/markdown/preface.doctree and b/tamingllms/_build/.doctrees/markdown/preface.doctree differ diff --git a/tamingllms/_build/.doctrees/markdown/toc.doctree b/tamingllms/_build/.doctrees/markdown/toc.doctree index 8681705..efae79c 100644 Binary files a/tamingllms/_build/.doctrees/markdown/toc.doctree and b/tamingllms/_build/.doctrees/markdown/toc.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/alignment.doctree b/tamingllms/_build/.doctrees/notebooks/alignment.doctree index f76948f..526d1b5 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/alignment.doctree and b/tamingllms/_build/.doctrees/notebooks/alignment.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/cost.doctree b/tamingllms/_build/.doctrees/notebooks/cost.doctree index c67f01a..7b68420 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/cost.doctree and b/tamingllms/_build/.doctrees/notebooks/cost.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/evals.doctree b/tamingllms/_build/.doctrees/notebooks/evals.doctree index 4da6a2f..1d0bc7b 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/evals.doctree and b/tamingllms/_build/.doctrees/notebooks/evals.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/input.doctree b/tamingllms/_build/.doctrees/notebooks/input.doctree index 70b1063..de98fc7 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/input.doctree and b/tamingllms/_build/.doctrees/notebooks/input.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/local.doctree b/tamingllms/_build/.doctrees/notebooks/local.doctree index 0b39c53..045a4b3 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/local.doctree and b/tamingllms/_build/.doctrees/notebooks/local.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/safety.doctree b/tamingllms/_build/.doctrees/notebooks/safety.doctree index 834a0e5..89ede2e 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/safety.doctree and b/tamingllms/_build/.doctrees/notebooks/safety.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/structured_output.doctree b/tamingllms/_build/.doctrees/notebooks/structured_output.doctree index a5ce8f8..6658553 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/structured_output.doctree and b/tamingllms/_build/.doctrees/notebooks/structured_output.doctree differ diff --git a/tamingllms/_build/html/_images/LC.png b/tamingllms/_build/html/_images/LC.png new file mode 100644 index 0000000..72602d1 Binary files /dev/null and b/tamingllms/_build/html/_images/LC.png differ diff --git a/tamingllms/_build/html/_images/embedding.svg b/tamingllms/_build/html/_images/embedding.svg new file mode 100644 index 0000000..adbe91b --- /dev/null +++ b/tamingllms/_build/html/_images/embedding.svg @@ -0,0 +1,118 @@ + + + + + + + + +EmbeddingWho is the Author of...model[0.123, 0.456, 0.789, ...]all-MiniLM-L6-v2 + + + + + + + \ No newline at end of file diff --git a/tamingllms/_build/html/_images/incontext.svg b/tamingllms/_build/html/_images/incontext.svg new file mode 100644 index 0000000..82c636f --- /dev/null +++ b/tamingllms/_build/html/_images/incontext.svg @@ -0,0 +1,4 @@ + + + +
Retrieval
Retrieval
RAG Context
RAG Context
reranking
reranking
Query
Query

LLM

LLM

Context Window

Context Wi...
Retrieval System
Retrieval System
VectorDB
VectorDB
\ No newline at end of file diff --git a/tamingllms/_build/html/_images/llm_judge.png b/tamingllms/_build/html/_images/llm_judge.png new file mode 100644 index 0000000..deeea05 Binary files /dev/null and b/tamingllms/_build/html/_images/llm_judge.png differ diff --git a/tamingllms/_build/html/_images/llm_judge.svg b/tamingllms/_build/html/_images/llm_judge.svg deleted file mode 100644 index 4292dfa..0000000 --- a/tamingllms/_build/html/_images/llm_judge.svg +++ /dev/null @@ -1,879 +0,0 @@ -LLM Judge Evaluation SystemLLM-Judgecomponentsapps
App Rankings
-Detailed Scores
-Analysis Report
-
-
Task description
-Scoring guidelines
-Output format
-
-
(Optional)Ground TruthLLM App 1LLM App 2

...

-
LLM App N Generate EvaluationPrompt Compare ResultsSubmit for Review - - - - - - - - - - - - - - - - -
\ No newline at end of file diff --git a/tamingllms/_build/html/_images/meta2.png b/tamingllms/_build/html/_images/meta2.png new file mode 100644 index 0000000..93f0a9b Binary files /dev/null and b/tamingllms/_build/html/_images/meta2.png differ diff --git a/tamingllms/_build/html/_images/meta2.svg b/tamingllms/_build/html/_images/meta2.svg deleted file mode 100644 index 8833843..0000000 --- a/tamingllms/_build/html/_images/meta2.svg +++ /dev/null @@ -1,882 +0,0 @@ -LLM Judge Pairwise Evaluation SystemPool of LLM JudgesPairwiseSelectorllmcomparison_pairHumanEvaluatorsRankingAlgorithm
LLM Judges Leaderboard
----------------------
-1. Judge C (0.95)
-2. Judge A (0.92)
-3. Judge B (0.89)
-   ...
-N. Judge X (0.75)
-
-
PromptLLM ResponseJudge AvsJudge B Draw JudgesGenerate PairInput forEvaluationEvaluatePreferencesGenerateRankings - - - - - - - - - - - - - - - - - - - - -
\ No newline at end of file diff --git a/tamingllms/_build/html/_images/rag.svg b/tamingllms/_build/html/_images/rag.svg new file mode 100644 index 0000000..6b77e28 --- /dev/null +++ b/tamingllms/_build/html/_images/rag.svg @@ -0,0 +1,4 @@ + + + +
Data Parsing & Ingestion
Data
Embeddings
Retrieval
RAG Context
reranking
Query

LLM

Context Window

Indexing
Query
User
VectorDB
Retrieval System
RAG
\ No newline at end of file diff --git a/tamingllms/_build/html/_images/similarity.png b/tamingllms/_build/html/_images/similarity.png new file mode 100644 index 0000000..4f2f228 Binary files /dev/null and b/tamingllms/_build/html/_images/similarity.png differ diff --git a/tamingllms/_build/html/_sources/markdown/intro.md b/tamingllms/_build/html/_sources/markdown/intro.md index a3879a7..ab10fe5 100644 --- a/tamingllms/_build/html/_sources/markdown/intro.md +++ b/tamingllms/_build/html/_sources/markdown/intro.md @@ -35,11 +35,15 @@ Throughout this book, we'll tackle the following (non-exhaustive) list of critic 3. **Testing Complexity**: Traditional software testing methodologies break down when dealing with non-deterministic and generative systems, requiring new approaches. -4. **Safety and Alignment**: LLMs can generate harmful, biased, or inappropriate content, requiring robust safeguards and monitoring systems to ensure safe deployment. +4. **Safety**: LLMs can generate harmful, biased, or inappropriate content, requiring robust safeguards and monitoring systems to ensure safe deployment. -5. **Vendor Lock-in**: Cloud-based LLM providers can create significant dependencies and lock-in through their proprietary APIs and infrastructure, making it difficult to switch providers or self-host solutions. +5. **Alignment**: LLMs are next-token prediction models, which means they are not aligned with the user's preferences by default. -6. **Cost Optimization**: The computational and financial costs of operating LLM-based systems can quickly become prohibitive without careful management, and optimization. +6. **Vendor Lock-in**: Cloud-based LLM providers can create significant dependencies and lock-in through their proprietary APIs and infrastructure, making it difficult to switch providers or self-host solutions. + +7. **Cost Optimization**: The computational and financial costs of operating LLM-based systems can quickly become prohibitive without careful management, and optimization. + +We conclude with a discussion on the future of LLMs and the challenges that will arise as we move forward. ## A Practical Approach @@ -171,7 +175,7 @@ Now that your environment is set up, let's begin our exploration of LLM challeng ## About the Author -Tharsis Souza (Ph.D. Computer Science, UCL University of London) is a computer scientist and product leader specializing in AI-based products. He is a Lecturer at Columbia University's Master of Science program in Applied Analytics, (*incoming*) Head of Product, Equities at Citadel, and former Senior VP at Two Sigma Investments. He mentors under-represented students & working professionals to help create a more diverse global AI1 ecosystem. +Tharsis Souza (Ph.D. Computer Science, UCL University of London) is a computer scientist and product leader specializing in AI-based products. He is a Lecturer at Columbia University's Master of Science program in Applied Analytics, (*incoming*) Head of Product, Equities at Citadel, and former Senior VP at Two Sigma Investments. He mentors under-represented students & working professionals to help create a more diverse global AI ecosystem. With over 15 years of experience delivering technology products across startups and Fortune 500 companies, he is also an author of numerous scholarly publications and a frequent speaker at academic and business conferences. Grounded on academic background and drawing from practical experience building and scaling up products powered by language models at early-stage startups, major institutions as well as contributing to open source projects, he brings a unique perspective on bridging the gap between LLMs promised potential and their practical implementation challenges to enable the next generation of AI-powered products. diff --git a/tamingllms/_build/html/_sources/markdown/toc.md b/tamingllms/_build/html/_sources/markdown/toc.md index 6b39520..c343795 100644 --- a/tamingllms/_build/html/_sources/markdown/toc.md +++ b/tamingllms/_build/html/_sources/markdown/toc.md @@ -43,4 +43,14 @@ Abstract: *The current discourse around Large Language Models (LLMs) tends to fo [cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/ [cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png -[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg \ No newline at end of file +[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg + +``` +@misc{tharsistpsouza2024tamingllms, + author = {Tharsis T. P. Souza}, + title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software}, + year = {2024}, + journal = {GitHub repository}, + url = {https://github.com/souzatharsis/tamingLLMs) +} +``` \ No newline at end of file diff --git a/tamingllms/_build/html/_sources/notebooks/cost.ipynb b/tamingllms/_build/html/_sources/notebooks/cost.ipynb index 4cd6849..5a1bc87 100644 --- a/tamingllms/_build/html/_sources/notebooks/cost.ipynb +++ b/tamingllms/_build/html/_sources/notebooks/cost.ipynb @@ -315,7 +315,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Quantization is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by {cite}`unsloth2024llama3` [^unsloth]. The model's memory requirements vary significantly based on the quantization level used as demonstrated in {numref}`quantized`.\n", + "Quantization[^visual-quantization] is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by {cite}`unsloth2024llama3` [^unsloth]. The model's memory requirements vary significantly based on the quantization level used as demonstrated in {numref}`quantized`.\n", + "\n", + "[^visual-quantization]: Maarten Grootendorst provides the best visual guide for model quantization {cite}`grootendorst2024quantization`.\n", "\n", "[^unsloth]: Unsloth runs a business of making LLMs fine-tuning streamlined. Check them out at [unsloth.ai](https://unsloth.ai).\n", "\n", diff --git a/tamingllms/_build/html/_sources/notebooks/evals.ipynb b/tamingllms/_build/html/_sources/notebooks/evals.ipynb index 002eb4a..68b2390 100644 --- a/tamingllms/_build/html/_sources/notebooks/evals.ipynb +++ b/tamingllms/_build/html/_sources/notebooks/evals.ipynb @@ -853,7 +853,7 @@ "4. **Run Evaluations**: Use the judge model to score outputs. Consider using a large and/or more capable model as a judge to provide more nuanced assessments.\n", "5. **Aggregate and Analyze Results**: Interpret scores to refine applications.\n", "\n", - "```{figure} ../_static/evals/llm_judge.svg\n", + "```{figure} ../_static/evals/llm_judge.png\n", "---\n", "name: llm_judge\n", "alt: Conceptual Overview\n", @@ -1187,11 +1187,11 @@ "\n", "An alternative to the above approaches is to use humans to directly evaluate the LLM-judges themselves. A notable example of this is [Judge Arena](https://judgearena.com/) {cite}`judgearena2024`, which is a platform that allows users to vote on which AI model made the better evaluation. Under this approach, the performance of the LLM evaluator is given by the (blind) evaluation of humans who perform the voting on randomly generated pairs of LLM judges as depicted in {numref}`meta2`. Only after submitting a vote, users can see which models were actually doing the judging.\n", "\n", - "```{figure} ../_static/evals/meta2.svg\n", + "```{figure} ../_static/evals/meta2.png\n", "---\n", "name: meta2\n", "alt: Human-in-the-loop meta evaluation Conceptual Overview\n", - "scale: 60%\n", + "scale: 75%\n", "align: center\n", "---\n", "Human-in-the-loop Meta Evaluation.\n", diff --git a/tamingllms/_build/html/_sources/notebooks/input.ipynb b/tamingllms/_build/html/_sources/notebooks/input.ipynb index a8d6b4c..8397a78 100644 --- a/tamingllms/_build/html/_sources/notebooks/input.ipynb +++ b/tamingllms/_build/html/_sources/notebooks/input.ipynb @@ -12,11 +12,6 @@ "-- Steve Jobs\n", "```\n", "```{contents}\n", - "```\n", - "\n", - "\n", - "```{note}\n", - "This Chapter is Work-in-Progress.\n", "```" ] }, @@ -26,20 +21,22 @@ "source": [ "## Introduction\n", "\n", - "Large Language Models face several critical challenges in effectively processing input data. While advances in long-context language models (LCLMs) {cite}`lee2024longcontextlanguagemodelssubsume` have expanded the amount of information these systems can process simultaneously, significant challenges remain in managing and effectively utilizing extended inputs. \n", + "While advances in long-context language models (LCs) {cite}`lee2024longcontextlanguagemodelssubsume` have expanded the amount of information these systems can process, significant challenges remain in managing and effectively utilizing extended data inputs:\n", "\n", - "LLMs are sensitive to input formatting and structure, requiring careful data preparation to achieve optimal results {cite}`tan2024htmlraghtmlbetterplain`. They operate with knowledge cutoffs, providing potentially stale or outdated information that may not reflect current reality and demonstrate problems with temporal knowledge accuracy {cite}`amayuelas-etal-2024-knowledge`. LLMs also struggle with less common but important information showing a systematic loss of long-tail knowledge {cite}`kotha2024understanding`.\n", + "- LLMs are sensitive to input formatting and structure, requiring careful data preparation to achieve optimal results {cite}`he2024doespromptformattingimpact, liu2024enhancingllmscognitionstructurization, tan2024htmlraghtmlbetterplain`.\n", + "- They operate with knowledge cutoffs, providing potentially stale or outdated information that may not reflect current reality and demonstrate problems with temporal knowledge accuracy {cite}`amayuelas-etal-2024-knowledge`.\n", + "- LLMs also face \"lost-in-the-middle\" problems {cite}`wu2024longdocumentsummaryevaluation` and struggle with less common but important information showing a systematic loss of long-tail knowledge {cite}`kotha2024understanding`.\n", "\n", - "Motivated by these challenges, this chapter explores two key components:\n", + "Motivated by these challenges, this chapter explores two key input data components:\n", "\n", - "1. Data Parsing: Parsing documents into a unified format that is suitable for LLMs to process.\n", + "1. Data Parsing and Chunking: Parsing and chunking documents into a unified format that is suitable and more manageable for LLMs to process.\n", "2. Retrieval Augmentation: Augmenting LLMs with the ability to retrieve relevant, recent, and specialized information.\n", "\n", "In data parsing, we will explore some useful open source tools that help transform data into LLM-compatible formats, demonstrating their impact through a case study of structured information extraction from complex PDFs. In a second case study, we will introduce some chunking strategies to help LLMs process long inputs and implement a particular technique called Chunking with Contextual Linking the enables contextually relevant chunk processing.\n", "\n", - "In retrieval augmentation, we will explore how to enhance LLMs with semantic search capabilities for incorporating external context using RAGs (Retrieval Augmented Generation). Through a detailed case study, we build a RAG system for querying live codebases, illustrating methods to bridge static model knowledge with dynamic information requirements.\n", + "In retrieval augmentation, we will explore how to enhance LLMs with semantic search capabilities for incorporating external context using RAGs (Retrieval Augmented Generation) while discussing whether RAGs will be really needed in the future given the rise of long-context language models.\n", "\n", - "In our last case study, we build a quiz generator using a LLM with large context window. We will explore some additional relevant techniques such as prompt caching and response verification through citations.\n", + "While RAGs are useful for incorporating external context, they are not a silver bullet nor a mandatory component for all LLM applications. In our last case study, we leverage long-context windows to build a quiz generator from a large knowledge base. We will also explore some additional relevant techniques such as prompt caching and response verification through citations.\n", "\n", "By the chapter's conclusion, readers will possess relevant knowledge of input data management strategies for LLMs and practical expertise in selecting and implementing appropriate approaches and tools for specific use cases." ] @@ -50,9 +47,11 @@ "source": [ "## Parsing Documents\n", "\n", - "Building robust data ingestion and preprocessing pipelines is essential for any LLM application. This section explores tools and frameworks that streamline input data processing, in particular for parsing purposes, providing a unified interface for converting diverse data formats into standardized representations that LLMs can effectively process. By abstracting away format-specific complexities, they allow developers to focus on core application logic rather than parsing implementation details while maximizing the performance of the LLM.\n", + "Data parsing and formatting play a critical role in LLMs performance {cite}`he2024doespromptformattingimpact, liu2024enhancingllmscognitionstructurization, tan2024htmlraghtmlbetterplain`. Hence, building robust data ingestion and preprocessing pipelines is essential for any LLM application. \n", + "\n", + "This section explores open source tools that streamline input data processing, in particular for parsing purposes, providing a unified interface for converting diverse data formats into standardized representations that LLMs can effectively process. By abstracting away format-specific complexities, they allow developers to focus on core application logic rather than parsing implementation details while maximizing the LLM performance.\n", "\n", - "We will cover open source tools and frameworks that provide parsing capabilities for a wide range of data formats. And we will demonstrate how some of these tools can be used to extract structured information from complex PDFs also discussing how the quality of the parser can impact LLM's performance." + "We will cover open source tools that provide parsing capabilities for a wide range of data formats. And we will demonstrate how some of these tools can be used to extract structured information from complex PDFs demonstrating how the quality of the parser can impact LLM's performance." ] }, { @@ -61,7 +60,7 @@ "source": [ "### MarkItDown\n", "\n", - "MarkItDown is a Python package and CLI too developed by the Microsoft AutoGen team for converting various file formats to Markdown. It supports a wide range of formats including PDF, PowerPoint, Word, Excel, images (with OCR and EXIF metadata), audio (with transcription), HTML, and other text-based formats making it a useful tool for document indexing and LLM-based applications.\n", + "MarkItDown {cite}`microsoft2024markitdown` is a Python package and CLI tool developed by the Microsoft AutoGen team for converting various file formats to Markdown. It supports a wide range of formats including PDF, PowerPoint, Word, Excel, images (with OCR and EXIF metadata), audio (with transcription), HTML, and other text-based formats making it a useful tool for document indexing and LLM-based applications.\n", "\n", "Key features:\n", "- Simple command-line and Python API interfaces\n", @@ -81,7 +80,7 @@ "\n", "### Docling\n", "\n", - "Docling is a Python package developed by IBM Research for parsing and converting documents into various formats. It provides advanced document understanding capabilities with a focus on maintaining document structure and formatting.\n", + "Docling {cite}`docling2024github` is a Python package developed by IBM Research for parsing and converting documents into various formats. It provides advanced document understanding capabilities with a focus on maintaining document structure and formatting.\n", "\n", "Key features:\n", "- Support for multiple document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, etc.)\n", @@ -101,13 +100,6 @@ "```" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Frameworks-Based Parsing\n" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -119,17 +111,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "A common use case where document parsing matters is to structured data extraction from documents, particularly in the presence of complex formatting and layout. In this case study, we will extract the economic forecasts from Merrill Lynch's CIO Capital Market Outlook released on December 16, 2024 {cite:p}`merrill2024`. We will focus on page 7 of this document, which contains several economic variables organized in a mix of tables, text and images (see {numref}`forecast`)\n", + "A common use case where document parsing matters is structured data extraction, particularly in the presence of complex formatting and layout. In this case study, we will extract the economic forecasts from Merrill Lynch's CIO Capital Market Outlook released on December 16, 2024 {cite}`merrill2024`. We will focus on page 7 of this document, which contains several economic variables organized in a mix of tables, text and images (see {numref}`forecast`).\n", "\n", "\n", "```{figure} ../data/input/forecast.png\n", "---\n", "name: forecast\n", "alt: Forecast\n", - "scale: 50%\n", + "scale: 45%\n", "align: center\n", "---\n", - "Forecast\n", + "Merrill Lynch's CIO Capital Market Outlook released on December 16, 2024 {cite}`merrill2024`\n", "```" ] }, @@ -184,7 +176,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "How similar are the two results? We can use use Levenshtein distance to measure the similarity between the two results. We will also calculate a naive score using the `SequenceMatcher` from the `difflib` package, which is a simple measure of the similarity between two strings based on the number of matches in the longest common subsequence." + "How similar are the two results? We can use use Levenshtein distance to measure the similarity between the two results. We will also calculate a naive score using the `SequenceMatcher` from the `difflib` package, which is a simple measure of similarity between two strings based on the number of matches in the longest common subsequence." ] }, { @@ -256,7 +248,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It turns out that the two results are quite different, with a similarity score of about 13.98% and 17.77% for Levenshtein and `SequenceMatcher` respectively." + "It turns out that the two results are quite different, with a similarity score of about 13.98% and 17.77% for Levenshtein and `SequenceMatcher`, respectively." ] }, { @@ -351,7 +343,7 @@ "scale: 45%\n", "align: center\n", "---\n", - "Forecast 2025\n", + "Merrill Lynch's CIO Economic Forecasts.\n", "```\n", "\n", "We will define a `Forecast` pydantic model to represent an economic forecast composed of a `financial_variable` and a `financial_forecast`. We will also define a `EconForecast` pydantic model to represent the list of economic forecasts we want to extract from the document.\n" @@ -375,7 +367,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We write a simple function to extract the economic forecasts from the document using an LLM model (with structured output) with the following prompt template, where `extract_prompt` is kind of data the user would like to extract and `doc` is the input document to analyze." + "We write a simple function to extract the economic forecasts from the document using an LLM model (with structured output) with the following prompt template, where `extract_prompt` represents the kind of data the user would like to extract and `doc` is the input document to analyze." ] }, { @@ -682,7 +674,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, let's focus on the asset class weightings. We will extract the asset class weightings from the document and compare the results from MarkItDown and Docling. The information now is presented in a quite different structure. The CIO view information is represented in a spectrum from starting with \"Underweight\", passing through \"Neutral\" and reaching \"Overweight\". The actual view is marked by some colored dots in the chart. Let's see if we can extract this information from the document.\n", + "Now, let's focus on the asset class weightings. We will extract the asset class weightings from the document and compare the results from MarkItDown and Docling. The information now is presented in a quite different structure as we can see in {ref}`asset_class`. The CIO view information is represented in a spectrum starting with \"Underweight\", passing through \"Neutral\" and reaching \"Overweight\". The actual view is marked by some colored dots in the chart. Let's see if we can extract this relatively more complex information from the document.\n", "```{figure} ../_static/input/asset_class.png\n", "---\n", "name: asset_class\n", @@ -729,7 +721,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we construct a DataFrame to compare the results from MarkItDown and Docling with an added \"true_value\" column containing the true values from the document, which we extracted manually from the chart." + "We construct a DataFrame to compare the results from MarkItDown and Docling with an added \"true_value\" column containing the true values from the document, which we extracted manually from the chart. This enables us to calculate accuracy of the structured data extraction task in case." ] }, { @@ -936,7 +928,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Docling performs significantly better at 93.33% accuracy missing only one value. MarkItDown achieves 53.33% accuracy, struggling with nuanced asset class weightings. In this case, Docling's structured parsed output did help the LLM to extract the information more accurately compared to MarkItDown's unstructured output. Hence, in this case, the strategy used to parse the data did impact the LLM's ability to extract the information. A more robust analysis would run data extraction on a large sample data a number of repeated runs to estimate error rates." + "We observe that Docling performs significantly better at 93.33% accuracy missing only one value. MarkItDown achieves 53.33% accuracy struggling with nuanced asset class weightings. In this case, Docling's structured parsed output did help the LLM to extract the information more accurately compared to MarkItDown's unstructured output. Hence, in this case, the strategy used to parse the data did impact the LLM's ability to extract structured information. Having said that, it is important to mention that a more robust analysis would run data extraction on a large sample data a number of repeated runs to estimate error rates since results are non-deterministic." ] }, { @@ -945,8 +937,8 @@ "source": [ "What if we want to systematically extract all tables from the document? We can use Docling to do that by simply accessing the `tables` attribute of the `DocumentConverter` object.\n", "\n", - "By doing that, we observe that Docling extracted 7 tables from the document. Exporting tables from top down and left to right in order of appearance in the document.\n", - "Below, we can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts as well as the last table, which contains CIO Equity Sector Views.\n" + "By doing that, we observe that Docling extracted 7 tables from the document exporting tables from top down and left to right in order of appearance in the document.\n", + "Below, we display the first two and the last tables. We can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts as well as the last table, which contains CIO Equity Sector Views.\n" ] }, { @@ -1593,7 +1585,14 @@ "- The description mentions \"overweight positions in certain sectors such as Utilities and Financials\" but looking at the CIO Equity Sector Views, both these sectors show neutral positions, not overweight positions.\n", "- For fixed income, the description cites a \"10-Year (4.03%)\" yield, but the image shows the 30-Year Yield at 4.03%, while the 10-Year Yield is actually 4.40%.\n", "\n", - "Arguably, the description's inaccuracies could be a consequence of the underlying LLM model's inability to process the image. Further research is needed to determine if this is the case." + "Arguably, the description's inaccuracies could be a consequence of the underlying LLM model's inability to process the image.\n", + "\n", + "We have covered MarkitDown and Docling as examples of open source tools that can help developers parse input data into a suitable format to LLMs. Other relevant open source tools worth mentioning include:\n", + "- Unstructured.io {cite}`unstructured2024github`: A Python library for unstructured data extraction.\n", + "- FireCrawl {cite}`mendable2024firecrawl`: A Fast and Efficient Web Crawler for LLM Training Data.\n", + "- LlamaParse {cite}`llamaparse2024github`: Llamaindex's data parsing solution.\n", + "\n", + "The choice of tool depends on the specific requirements of the application and the nature of the input data. This choice should be taken as a critical decision of any data intensive LLM-based application and deserves dedicated research and evidence-based experimentation.\n" ] }, { @@ -1602,75 +1601,152 @@ "source": [ "## Retrieval-Augmented Generation\n", "\n", - "RAG is a technique that allows LLMs to retrieve information from a knowledge base to answer questions. It is a popular technique for building LLM applications that require knowledge-intensive tasks {cite}`lewis2021retrievalaugmentedgenerationknowledgeintensivenlp`.\n", + "What happens if we asked ChatGPT who's the author of the book \"Taming LLMs\"?\n", "\n", - "RAG utilizes a retrieval system to fetch external knowledge and augment the LLM. It has proved effective in mitigating hallucinations of LLMs {cite}`10.1145/3589334.3645481, ni-etal-2024-llms`." + "\n" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 1, "metadata": {}, + "outputs": [], "source": [ - "## Case Studies\n", - "\n", - "This section presents three case studies that demonstrate practical solutions to common LLM limitations:\n", - "\n", - "First, Content Chunking with Contextual Linking showcases how intelligent chunking strategies can overcome both context window and output token limitations. This case study illustrates techniques for breaking down and reassembling content while maintaining coherence, enabling the generation of high-quality long-form outputs despite model constraints.\n", + "from dotenv import load_dotenv\n", + "import os\n", "\n", - "Second, a Retrieval Augmented Generation case study addresses the challenge of stale or outdated model knowledge. By implementing semantic search over a GitHub repository, this example demonstrates how to augment LLM responses with current, accurate information - allowing users to query and receive up-to-date answers about code repository contents.\n", + "# Load environment variables from .env file\n", + "load_dotenv()\n", "\n", - "Third, the final case study builds a Quiz generator with citations. This case study explores some additional input management techniques that become particularly useful when long context window is available. This includes implementing prompt caching for efficiency and adding citations to enhance response accuracy and verifiability. These approaches show how to maximize the benefits of larger context models while maintaining response quality." + "from openai import OpenAI\n", + "client = OpenAI()\n", + "model = \"gpt-4o-mini\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "question = \"Who's the Author of the Book Taming LLMs?\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The book \"Taming LLMs\" is authored by *G. Arulkumaran, H. M. B. P. D. Karthikeyan, and I. A. M. Almasri.* If you need more information about the book or its contents, feel free to ask!\n" + ] + } + ], + "source": [ + "response = client.chat.completions.parse(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": question}\n", + " ]\n", + ")\n", + "response.choices[0].message.content" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Case Study I: Content Chunking with Contextual Linking\n", + "Turns out ChatGPT hallucinates. A quick web search on the before mentioned authors yields no results. In fact, those authors names are made up. And of course the correct answer would have been \"Tharsis Souza\".\n", "\n", - "Content chunking with contextual linking is a technique to break down long-form content into smaller, manageable chunks while keeping chunk-specific context. This approach tackles three problems:\n", - "1. The LLM's inability to process long inputs to do context-size limits\n", - "2. The LLM's inability to generate long-form content due to the `max_output_tokens` limitation.\n", - "3. The LLM's inability to maintain coherence and context when generating responses per chunks\n", + "LLMs only have access to the information they have been trained on, which of course has been fixed at a point in time. Hence, LLMs operate with stale data. The problem gets exacerbated by the fact that LLMs are trained to provide an answer even if the answer is unknown by them, hence leading to hallucinations. \n", "\n", - "Here, we exemplify this technique by following these steps:\n", - "1. **Chunking the Content**: The input content is split into smaller chunks. This allows the LLM to process each chunk individually, focusing on generating a complete and detailed response for that specific section of the input.\n", + "One solution to this problem is to use a retrieval system to fetch information from a knowledge base to provide recent and relevant context to user queries using so-called Retrieval Augmented Generation (RAG) system.\n", "\n", - "2. **Maintaining Context**: Each chunk is linked with contextual information from the previous chunks. This helps in maintaining the flow and coherence of the content across multiple chunks.\n", + "RAG utilizes a retrieval system to fetch external knowledge and augment LLM's context. It is a useful technique for building LLM applications that require domain-specific information or knowledge-intensive tasks {cite}`lewis2021retrievalaugmentedgenerationknowledgeintensivenlp`. It has also proved effective in mitigating LLMs hallucinations {cite}`10.1145/3589334.3645481, ni-etal-2024-llms`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the above example, a RAG would help with hallucinations by grounding the LLM's response to information provided in the knowledge base. Additional common use cases of RAG systems include:\n", "\n", - "3. **Generating Linked Prompts**: For each chunk, a prompt is generated that includes the chunk's content and its context. This prompt is then used to generate the output for that chunk.\n", + "1. **Enterprise Knowledge Management**: RAG enables organizations to synthesize answers from diverse internal data sources like documents, databases, and communication channels. This creates a unified knowledge interface that can accurately answer questions using the organization's own data.\n", + "2. **Document Processing and Analysis**: RAG excels at extracting and analyzing information from complex documents like financial reports, presentations, and spreadsheets. The system can enable LLMs to understand context and relationships across different document types and formats.\n", + "3. **Intelligent Customer Support**: By combining knowledge bases with conversational abilities, RAG powers chatbots and support systems that can maintain context across chat history, provide accurate responses, and handle complex customer queries while reducing hallucinations.\n", + "4. **Domain-Specific Applications**: RAG allows LLMs to be equipped with specialized knowledge in fields like medicine, law, or engineering by retrieving information from domain-specific literature, regulations, and technical documentation. This enables accurate responses aligned with professional standards and current best practices.\n", + "5. **Code Documentation and Technical Support**: RAG can help developers by retrieving relevant code examples, API documentation, and best practices from repositories and documentation, which often suffer updates frequently, enabling more accurate and contextual coding assistance.\n", "\n", - "4. **Combining the Outputs**: The outputs of all chunks are combined to form the final long-form content.\n", + "If LLMs alone work on stale, general-purpose data with the added challenge of being prone to hallucinations, RAG systems serve as an added capability enabling LLMs to work on recent, domain-specific knowledge increasing the likelihood of LLMs to provide responses that are factual and relevant to user queries.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### RAG Pipeline\n", "\n", - "Let's examine an example implementation of this technique.\n", + "RAG architectures vary but they all share the same goal: to retrieve relevant information from a knowledge base to maximize the LLM's ability to effectively and accurately respond to prompts, particularly when the answer requires out-of-training data information.\n", "\n", - "#### Generating long-form content\n", + "We will introduce key components of a RAG system one by one leading to a full canonical RAG pipeline at the end that ultimately will be used to answer our original question \"Who's the author of the book Taming LLMs?\", accurately.\n", "\n", - "- Goal: Generate a long-form report analyzing a company's financial statement.\n", - "- Input: A company's 10K SEC filing.\n", + "The following basic components will be introduced (see {numref}`rag_pipeline` for a visual representation):\n", + "- Vector Database\n", + " - Embeddings\n", + " - Indexing\n", + "- Retrieval System including re-ranking\n", + "- LLM Augmented Generation via in-context learning\n", "\n", - "```{figure} ../_static/structured_output/diagram1.png\n", + "Data extraction, parsing and chunking are also part of a canonical pipeline as we prepare the knowledge base. Those are concepts that we have already explored in the previous sections, hence we will be succinct here. We will start by preparing the knowledge base.\n", + "\n", + "```{figure} ../_static/input/rag.svg\n", "---\n", - "name: content-chunking-with-contextual-linking\n", - "alt: Content Chunking with Contextual Linking\n", - "scale: 50%\n", + "name: rag_pipeline\n", + "alt: RAG Pipeline\n", + "scale: 99%\n", "align: center\n", "---\n", - "Content Chunking with Contextual Linking Schematic Representation.\n", - "```\n", - "\n", - "The diagram in {numref}`content-chunking-with-contextual-linking` illustrates the process we will follow for handling long-form content generation with Large Language Models through \"Content Chunking with Contextual Linking.\" It shows how input content is first split into manageable chunks using a chunking function (e.g. `CharacterTextSplitter` with `tiktoken` tokenizer), then each chunk is processed sequentially while maintaining context from previous chunks. For each chunk, the system updates the context, generates a dynamic prompt with specific parameters, makes a call to the LLM chain, and stores the response. After all chunks are processed, the individual responses are combined with newlines to create the final report, effectively working around the token limit constraints of LLMs while maintaining coherence across the generated content.\n", + "Simplified RAG Pipeline\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Preparing the Knowledge Base\n", "\n", - "**Step 1: Chunking the Content**\n", + "Every RAG system requires a knowledge base. In our case, the knowledge base is a set of documents that we equip the LLM to answer our authorship question.\n", "\n", - "There are different methods for chunking, and each of them might be appropriate for different situations. However, we can broadly group chunking strategies in two types:\n", - "- **Fixed-size Chunking**: This is the most common and straightforward approach to chunking. We simply decide the number of tokens in our chunk and, optionally, whether there should be any overlap between them. In general, we will want to keep some overlap between chunks to make sure that the semantic context doesn’t get lost between chunks. Fixed-sized chunking may be a reasonable path in many common cases. Compared to other forms of chunking, fixed-sized chunking is computationally cheap and simple to use since it doesn’t require the use of any specialied techniques or libraries.\n", - "- **Content-aware Chunking**: These are a set of methods for taking advantage of the nature of the content we’re chunking and applying more sophisticated chunking to it. Examples include:\n", - " - **Sentence Splitting**: Many models are optimized for embedding sentence-level content. Naturally, we would use sentence chunking, and there are several approaches and tools available to do this, including naive splitting (e.g. splitting on periods), NLTK, and spaCy.\n", - " - **Recursive Chunking**: Recursive chunking divides the input text into smaller chunks in a hierarchical and iterative manner using a set of separators.\n", - " - **Semantic Chunking**: This is a class of methods that leverages embeddings to extract the semantic meaning present in your data, creating chunks that are made up of sentences that talk about the same theme or topic.\n", + "Hence, we will compose our knowledge base by adding the web version of (some of the chapters of) the book \"Taming LLMs\", namely:\n", + "- Introduction\n", + "- Structured Output\n", + "- Input (this very chapter)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "book_url = \"https://www.tamingllms.com/\"\n", + "chapters = [\"markdown/intro.html\",\n", + " \"notebooks/structured_output.html\",\n", + " \"notebooks/input.html\"]\n", "\n", - " Here, we will utilize `langchain` for a content-aware sentence-splitting strategy for chunking. Langchain offers several text splitters {cite}`langchain_text_splitters` such as JSON-, Markdown- and HTML-based or split by token. We will use the `CharacterTextSplitter` with `tiktoken` as our tokenizer to count the number of tokens per chunk which we can use to ensure that we do not surpass the input token limit of our model.\n" + "chapter_urls = [f\"{book_url}/{chapter}\" for chapter in chapters]\n", + "chapter_ids = [chapter.split(\"/\")[-1].replace(\".html\", \"\") for chapter in chapters]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use `Docling` to download the chapters from the web and parse them as markdown files." ] }, { @@ -1679,36 +1755,57 @@ "metadata": {}, "outputs": [], "source": [ - "def get_chunks(text: str, chunk_size: int, chunk_overlap: int) -> list:\n", - " \"\"\"\n", - " Split input text into chunks of specified size with specified overlap.\n", + "chapters = [converter.convert(chapter_url).document.export_to_markdown() for chapter_url in chapter_urls]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we are ready to store the chapters in a vector database to enable the construction of a retrieval system." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Vector Database\n", "\n", - " Args:\n", - " text (str): The input text to be chunked.\n", - " chunk_size (int): The maximum size of each chunk in tokens.\n", - " chunk_overlap (int): The number of tokens to overlap between chunks.\n", + "Vector databases are specialized databases designed to store and retrieve high-dimensional vectors, which are mathematical representations of data like text, images, or audio. These databases are optimized for similarity search operations, making them ideal for embeddings-based retrieval systems.\n", "\n", - " Returns:\n", - " list: A list of text chunks.\n", - " \"\"\"\n", - " from langchain_text_splitters import CharacterTextSplitter\n", + "A typical pipeline involving a vector database includes the following:\n", "\n", - " text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", - " return text_splitter.split_text(text)\n" + "1. Input data is converted into \"documents\" forming a collection representing our knowledge base\n", + "2. Each document is converted into an embedding which are stored in the vector database\n", + "3. Embeddings are indexed in the vector database for efficient similarity search\n", + "4. The vector database is queried to retrieve the most relevant documents\n", + "5. The retrieved documents are used to answer questions\n", + "\n", + "Vector databases are not a mandatory component of RAG systems. In fact, we can use a simple list of strings to store the chapters (or their chunks) and then use the LLM to answer questions about the document. However, vector databases are useful for RAG applications as they enable:\n", + "- Fast similarity search for finding relevant context\n", + "- Efficient storage of document embeddings\n", + "- Scalable retrieval for large document collections\n", + "- Flexible querying with metadata filters\n", + "\n", + "In that way, RAG applications can be seen as a retrieval system that uses a vector database to store and retrieve embeddings of documents, which in turn are used to augment LLMs with contextually relevant information as we will see in the next sections.\n", + "\n", + "Here, we will use ChromaDB {cite}`chromadb2024docs` as an example of an open source vector database but key features and concepts we cover are applicable to other vector databases, in general.\n", + "\n", + "ChromaDB is a popular open-source vector database that offers:\n", + "- Efficient storage and retrieval of embeddings\n", + "- Support for metadata and filtering\n", + "- Easy integration with Python applications\n", + "- In-memory and persistent storage options\n", + "- Support for multiple distance metrics\n", + "\n", + "Other notable vector databases include Weaviate, FAISS, and Milvus." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Step 2: Writing the Base Prompt Template**\n", - "\n", - "We will write a base prompt template which will serve as a foundational structure for all chunks, ensuring consistency in the instructions and context provided to the language model. The template includes the following parameters:\n", - "- `role`: Defines the role or persona the model should assume.\n", - "- `context`: Provides the background information or context for the task.\n", - "- `instruction`: Specifies the task or action the model needs to perform.\n", - "- `input_text`: Contains the actual text input that the model will process.\n", - "- `requirements`: Lists any specific requirements or constraints for the output." + "In ChromaDB, we can create a vector database client as follows." ] }, { @@ -1717,26 +1814,17 @@ "metadata": {}, "outputs": [], "source": [ - "from langchain_core.prompts import PromptTemplate\n", - "def get_base_prompt_template() -> str:\n", - " \n", - " base_prompt = \"\"\"\n", - " ROLE: {role}\n", - " CONTEXT: {context}\n", - " INSTRUCTION: {instruction}\n", - " INPUT: {input}\n", - " REQUIREMENTS: {requirements}\n", - " \"\"\"\n", - " \n", - " prompt = PromptTemplate.from_template(base_prompt)\n", - " return prompt" + "import chromadb\n", + "chroma_client = chromadb.Client()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We will write a simple function that returns an `LLMChain` which is a simple `langchain` construct that allows you to chain together a combination of prompt templates, language models and output parsers." + "This will create a vector database in memory. We can also create a persistent vector database by specifying a path to a directory or alternatively by using a cloud-based vector database service like AWS, Azure or GCP. We will use a vector database in memory for this example.\n", + "\n", + "Next, we create a collection to store the embeddings of the chapters. And add our chapters as documents to the collection as follows." ] }, { @@ -1745,45 +1833,19 @@ "metadata": {}, "outputs": [], "source": [ - "from langchain_core.output_parsers import StrOutputParser\n", - "from langchain_community.chat_models import ChatLiteLLM\n", + "collection = chroma_client.create_collection(name=\"taming_llms\")\n", "\n", - "def get_llm_chain(prompt_template: str, model_name: str, temperature: float = 0):\n", - " \"\"\"\n", - " Returns an LLMChain instance using langchain.\n", - "\n", - " Args:\n", - " prompt_template (str): The prompt template to use.\n", - " model_name (str): The name of the model to use.\n", - " temperature (float): The temperature setting for the model.\n", - "\n", - " Returns:\n", - " llm_chain: An instance of the LLMChain.\n", - " \"\"\"\n", - " \n", - " from dotenv import load_dotenv\n", - " import os\n", - "\n", - " # Load environment variables from .env file\n", - " load_dotenv()\n", - " \n", - " api_key_label = model_name.split(\"/\")[0].upper() + \"_API_KEY\"\n", - " llm = ChatLiteLLM(\n", - " model=model_name,\n", - " temperature=temperature,\n", - " api_key=os.environ[api_key_label],\n", - " )\n", - " llm_chain = prompt_template | llm | StrOutputParser()\n", - " return llm_chain" + "collection.add(\n", + " documents=chapters,\n", + " ids=chapter_ids\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Step 3: Constructing Dynamic Prompt Parameters**\n", - "\n", - "Now, we will write a function (`get_dynamic_prompt_template`) that constructs prompt parameters dynamically for each chunk." + "We are ready to query the collection. We write a simple function that takes the collection, input query and number of retrieved results as argument and returns the retrieved documents." ] }, { @@ -1792,59 +1854,19 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Dict\n", - "def get_dynamic_prompt_params(prompt_params: Dict, \n", - " part_idx: int, \n", - " total_parts: int,\n", - " chat_context: str,\n", - " chunk: str) -> str:\n", - " \"\"\"\n", - " Construct prompt template dynamically per chunk while maintaining the chat context of the response generation.\n", - " \n", - " Args:\n", - " prompt_params (Dict): Original prompt parameters\n", - " part_idx (int): Index of current conversation part\n", - " total_parts (int): Total number of conversation parts\n", - " chat_context (str): Chat context from previous parts\n", - " chunk (str): Current chunk of text to be processed\n", - " Returns:\n", - " str: Dynamically constructed prompt template with part-specific params\n", - " \"\"\"\n", - " dynamic_prompt_params = prompt_params.copy()\n", - " # saves the chat context from previous parts\n", - " dynamic_prompt_params[\"context\"] = chat_context\n", - " # saves the current chunk of text to be processed as input\n", - " dynamic_prompt_params[\"input\"] = chunk\n", - " \n", - " # Add part-specific instructions\n", - " if part_idx == 0: # Introduction part\n", - " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", - " You are generating the Introduction part of a long report.\n", - " Don't cover any topics yet, just define the scope of the report.\n", - " \"\"\"\n", - " elif part_idx == total_parts - 1: # Conclusion part\n", - " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", - " You are generating the last part of a long report. \n", - " For this part, first discuss the below INPUT. Second, write a \"Conclusion\" section summarizing the main points discussed given in CONTEXT.\n", - " \"\"\"\n", - " else: # Main analysis part\n", - " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", - " You are generating part {part_idx+1} of {total_parts} parts of a long report.\n", - " For this part, analyze the below INPUT.\n", - " Organize your response in a way that is easy to read and understand either by creating new or merging with previously created structured sections given in CONTEXT.\n", - " \"\"\"\n", - " \n", - " return dynamic_prompt_params" + "def query_collection(collection, query_text, n_results=3):\n", + " results = collection.query(\n", + " query_texts=[query_text],\n", + " n_results=n_results\n", + " )\n", + " return results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "**Step 4: Generating the Report**\n", - "\n", - "Finally, we will write a function that generates the actual report by calling the `LLMChain` with the dynamically updated prompt parameters for each chunk and concatenating the results at the end." + "We write a simple query, enquiring the purpose of the book." ] }, { @@ -1853,24 +1875,907 @@ "metadata": {}, "outputs": [], "source": [ - "def generate_report(input_content: str, llm_model_name: str, \n", - " role: str, requirements: str,\n", - " chunk_size: int, chunk_overlap: int) -> str:\n", - " # stores the parts of the report, each generated by an individual LLM call\n", - " report_parts = [] \n", - " # split the input content into chunks\n", - " chunks = get_chunks(input_content, chunk_size, chunk_overlap)\n", - " # initialize the chat context with the input content\n", - " chat_context = input_content\n", - " # number of parts to be generated\n", - " num_parts = len(chunks)\n", - "\n", - " prompt_params = {\n", - " \"role\": role, # user-provided\n", - " \"context\": \"\", # dinamically updated per part\n", - " \"instruction\": \"\", # dynamically updated per part\n", - " \"input\": \"\", # dynamically updated per part\n", - " \"requirements\": requirements #user-priovided\n", + "q = \"What is the purpose of this book?\"\n", + "res = query_collection(collection, q)\n", + "res.get(\"ids\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print([['intro', 'input', 'structured_output']])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As response, we obtain an object that contains several attributes including:\n", + "- `documents`: The actual documents retrieved from the collection, i.e. the chapters \n", + "- `ids`: The ids of the documents retrieved from the collection\n", + "- `distances`: The distances of the documents to the query vector\n", + "\n", + "We can see that the chapters \"Introduction\", \"Input\" and \"Structured Output\" are retrieved from the collection ordered by their distance to the query vector.\n", + "\n", + "We observe that the Introduction chapter is the most relevant one as it ranks first, followed by the Input and Structured Output chapters. Indeed, the purpose of the book is included in the Introduction chapter demonstrating the retrieval system successfully retrieved the most relevant document to the input query, in this simple example.\n", + "\n", + "In order to understand how the retrieval system works and how the \"distance to the query vector\" is computed, we need to understand how the embeddings are created and how the documents are indexed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Embeddings**\n", + "\n", + "Embeddings are numerical representations of data (including text, images, audio, etc.) that capture meaning, allowing machines to process data quantitatively. Each embedding can be represented as a vector of floating-point numbers such that embedded data with similar meanings produce similar, i.e. close, vectors [^embeddings_definition].\n", + "\n", + "[^embeddings_definition]: Bengio et al. {cite}`bengio2014representationlearningreviewnew` provide serves as an excellent reference for representation learning in general including embeddings. OpenAI provides a good intro to Embeddings for developers {cite}`openai2024embeddings`\n", + "\n", + "For text data, small distances among embeddings suggest high semantic relatedness and large distances suggest low semantic relatedness among the embedded texts. HuggingFace provides a leaderboard of embeddings models {cite}`huggingface2024mteb`, which are ranked by in dimensions such as classification, clustering and reranking performance.\n", + "\n", + "Behind the scenes, ChromaDB is using the model `all-MiniLM-L6-v2` by default [^chroma_embeddings] to create embeddings for the input documents and the query (see {numref}`embedding`). This model is available in `sentence_transformers` {cite}`sentencetransformers2024website`. Let's see how it works.\n", + "\n", + "```{figure} ../_static/input/embedding.svg\n", + "---\n", + "name: embedding\n", + "alt: Embedding\n", + "scale: 70%\n", + "align: center\n", + "---\n", + "Embedding\n", + "```\n", + "\n", + "[^chroma_embeddings]: ChromaDB enables custom embedding functions and provides a list of wrappers around commonly used embedding models and APIs https://docs.trychroma.com/docs/embeddings/embedding-functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sentence_transformers import SentenceTransformer\n", + "\n", + "embedding_model = SentenceTransformer('all-MiniLM-L6-v2')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We replicate what ChromaDB did by embedding our chapters as well as input query using sentence transformers." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(4, 384)\n" + ] + } + ], + "source": [ + "q = \"What is the purpose of this book?\"\n", + "docs_to_embed = [q] + chapters\n", + "embeddings = embedding_model.encode(docs_to_embed)\n", + "print(embeddings.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a result, we obtain four 384-dimensional vectors representing our embeddings (one for each of the three chapters and one for the input query).\n", + "\n", + "Now we can calculate similarity among the embeddings. By default, sentence transformers uses cosine similarity to calculate the similarity between embeddings. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "similarities = embedding_model.similarity(embeddings, embeddings)\n", + "similarities" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```\n", + "tensor([[1.0000, 0.4402, 0.3022, 0.4028],\n", + " [0.4402, 1.0000, 0.6606, 0.5807],\n", + " [0.3022, 0.6606, 1.0000, 0.6313],\n", + " [0.4028, 0.5807, 0.6313, 1.0000]])\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's visualize the similarity matrix to better understand the relationships between our documents in {numref}`similarities`. The top row of the matrix represents the similarity of the input query against all chapters. That's exactly what we previously obtained by querying ChromaDB which returned a response with documents ranked by similarity to input query.\n", + "\n", + "```{figure} ../_static/input/similarity.png\n", + "---\n", + "name: similarities\n", + "alt: Similarity matrix heatmap\n", + "scale: 90%\n", + "align: center\n", + "---\n", + "Similarity matrix heatmap showing relationships among query and chapters.\n", + "``` \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Calculating similarity among embeddings can become computationally intensive if brute force is used, i.e. pair-wise computation, as the number of documents grows in the knowledge base. Indexing is a technique to help address this challenge." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Indexing**\n", + "\n", + "Indexing is a crucial optimization technique that makes similarity searches faster and more efficient.\n", + "\n", + "Without indexing, finding similar vectors would require an exhaustive search - comparing a query vector against every single vector in the database. For large datasets, this becomes prohibitively slow.\n", + "\n", + "Common indexing strategies include:\n", + "\n", + "1. **Tree-based Indexes**\n", + " - Examples include KD-trees and Ball trees\n", + " - Work by partitioning the vector space into hierarchical regions\n", + " - Effective for low-dimensional data but suffer from the \"curse of dimensionality\"\n", + "\n", + "2. **Graph-based Indexes**\n", + " - HNSW (Hierarchical Navigable Small World) is a prominent example\n", + " - Creates a multi-layered graph structure for navigation\n", + " - Offers excellent search speed but requires more memory\n", + "\n", + "3. **LSH (Locality-Sensitive Hashing)**\n", + " - Uses hash functions that map similar vectors to the same buckets\n", + " - More memory-efficient than graph-based methods\n", + " - May sacrifice some accuracy for performance\n", + "\n", + "4. **Quantization-based Indexes**\n", + " - Product Quantization compresses vectors by encoding them into discrete values\n", + " - Reduces memory footprint significantly\n", + " - Good balance between accuracy and resource usage\n", + "\n", + "HNSW is the underlying library for Chroma vector indexing and search {cite}`chromadb2024hnsw`. HNSW provides fast searches with high accuracy but uses more memory. LSH and quantization methods offer better memory efficiency but may sacrifice some precision.\n", + "\n", + "But are indexing + basic embeddings based similarity sufficient? Often not, as we will see next as we cover reranking technique." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reranking\n", + "\n", + "Let's go back to querying our vector database. Here are additional examples." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we write a query about how to get structured output from LLMs. Successfully retrieving the \"Structured Output\" chapter from the book as top result." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[['structured_output', 'input', 'intro']]\n" + ] + } + ], + "source": [ + "q = \"How to get structured output from LLMs?\"\n", + "res = query_collection(collection, q)\n", + "res.get(\"ids\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we would like to obtain a tutorial on `Docling`, a tool we covered in this very chapter. However, we fail to obtain the correct chapter and instead obtain the \"Introduction\" chapter as a result." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[['intro', 'input', 'structured_output']]\n" + ] + } + ], + "source": [ + "q = \"Docling tutorial\"\n", + "res = query_collection(collection, q)\n", + "res.get(\"ids\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Retrieval systems solely based on vector similarity search might miss semantic relevance. That brings the need for techniques that can improve accuracy of the retrieval system. One such technique is re-ranking.\n", + "\n", + "Re-ranking is a method that can improve accuracy of the retrieval system by re-ranking the retrieved documents based on their relevance to the input query.\n", + "\n", + "In the following, we will use the `sentence_transformers` library to re-rank the retrieved documents based on their relevance to the input query. We utilize the `CrossEncoder` model to re-rank the documents. Cross-Encoder models are more accurate at judging relevance at the cost of speed compared to basic vector-based similarity. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can implement a reranking step in a RAG system using a Cross-Encoder model in the following steps:\n", + "\n", + "1. First, we initialize the Cross-Encoder model:\n", + "```python\n", + "model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)\n", + "```\n", + "- Uses the `ms-marco-MiniLM-L-6-v2` model, which is specifically trained for passage reranking\n", + "- Sets a maximum sequence length of 512 tokens\n", + "- This model is designed to score the relevance between query-document pairs\n", + "\n", + "2. Then we perform the reranking:\n", + "```python\n", + "scores = model.predict([(q, doc) for doc in res[\"documents\"][0]])\n", + "```\n", + "- Creates pairs of (query, document) for each retrieved document\n", + "- The model predicts relevance scores for each pair\n", + "- Higher scores indicate better semantic match between query and document\n", + "\n", + "3. Finally, we select the best match:\n", + "```python\n", + "print(res[\"documents\"][0][np.argmax(scores)])\n", + "```\n", + "- `np.argmax(scores)` finds the index of the highest scoring document\n", + "- Uses that index to retrieve the most relevant document\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We obtain the following scores for the retrieved documents (\"intro\", \"input\", \"structured_output\"), the higher the score, the more relevant the document is in relation to the input query.\n", + "\n", + "```\n", + "array([-8.52623 , -6.328738, -8.750055], dtype=float32)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a result, we obtain the index of the highest scoring document, which corresponds to the \"input\" chapter. Hence, the re-ranking step successfully retrieved the correct chapter." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "input\n" + ] + } + ], + "source": [ + "print(res[\"ids\"][0][np.argmax(scores)])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The ideia is to first run semantic similarity on embeddings, which should be fast but potentially inaccurate, and then run re-raking on the top-k results, which is more accurate but slower. By doing so, we can balance the speed and accuracy of the retrieval system.\n", + "\n", + "Hence, instead of going over all retrieved documents:\n", + "```python\n", + "scores = model.predict([(q, doc) for doc in res[\"documents\"][0]])\n", + "```\n", + "We would run reranking on the TOPK results, where TOPK <<< number of documents:\n", + "```python\n", + "scores = model.predict([(q, doc) for doc in res[\"documents\"][0][:TOPK]])\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### LLMs with RAG\n", + "\n", + "We are finally ready to use the retrieval system to help the LLM answer our authorship question. A common way to integrate RAGs with LLMs is via in-context learning. With in-context learning the LLM learns from the retrieved documents by providing them in the context window as represented in {numref}`incontext`. This is accomplished via a prompt template structure as follows.\n", + "\n", + "```{figure} ../_static/input/incontext.svg\n", + "---\n", + "name: incontext\n", + "alt: In-Context Learning\n", + "scale: 95%\n", + "align: center\n", + "---\n", + "RAG LLM with In-Context Learning\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + " rag_system_prompt_template = f\"\"\"\n", + " You are a helpful assistant that answers questions based on the provided CONTEXT.\n", + "\n", + " CONTEXT: {context}\n", + " \"\"\"\n", + "\n", + " user_prompt_template = f\"\"\"\n", + " QUESTION: {input}\n", + " \"\"\"\n", + "```\n", + "\n", + "This prompt strategy demonstrates a common in-context learning pattern where retrieved documents are incorporated into the LLM's context to enhance response accuracy and relevance. The prompt structure typically consists of a system prompt that:\n", + "- Sets clear boundaries for the LLM to use information from the provided context\n", + "- Includes the retrieved documents as context\n", + "\n", + "This approach:\n", + "- Reduces hallucination by grounding responses in source documents\n", + "- Improves answer relevance by providing contextually relevant information to the LLM\n", + "\n", + "The context variable is typically populated with the highest-scoring document(s) from the retrieval step, while the input variable contains the user's original query." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def RAG_qa(client, model, context, input):\n", + " \"\"\"\n", + " Generate a summary of input using a given model\n", + " \"\"\"\n", + " rag_system_prompt_template = f\"\"\"You are a helpful assistant that answers questions based on the provided CONTEXT.\n", + "\n", + " CONTEXT: {context}\n", + " \"\"\"\n", + " \n", + " response = client.chat.completions.create(\n", + " model=model,\n", + " messages=[{\"role\": \"system\", \"content\": rag_system_prompt_template},\n", + " {\"role\": \"user\", \"content\": f\"QUESTION: {input}\"}]\n", + " )\n", + " return response.choices[0].message.content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we set the LLM." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dotenv import load_dotenv\n", + "import os\n", + "\n", + "# Load environment variables from .env file\n", + "load_dotenv()\n", + "\n", + "from openai import OpenAI\n", + "client = OpenAI()\n", + "model = \"gpt-4o-mini\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, we run the retrieve step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "res = query_collection(collection, q)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we run the re-ranking step setting it to consider the `TOPK` retrieved documents." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TOPK = 2\n", + "scores = model.predict([(q, doc) for doc in res[\"documents\"][0][:TOPK]])\n", + "res_reranked = res[\"documents\"][0][np.argmax(scores)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then pass the top document as context and invoke the LLM with our RAG-based template leading to a successful response." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The author of the book \"Taming LLMs\" is Tharsis Souza.\n" + ] + } + ], + "source": [ + "answer = RAG_qa(model, res_reranked[0], question)\n", + "answer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this section, we motivated the use of RAGs as a tool to equip LLMs with relevant context and provided a canonical implementation of its core components. RAGs, however, can be implemented in many shapes and forms and entire books have been written about them. We point the user to additional resources if more specialized techniques and architectures are needed {cite}`kimothi2024simpleguiderag, athinaai2024ragcookbooks, diamant2024ragtechniques, hands-on-llms-book`.\n", + "\n", + "Next, we discuss RAGs challenges and limitations and conclude our RAGs section envisioning the future of RAGs challenged by the rise of long-context language models.\n", + "\n", + "### Challenges and Limitations\n", + "\n", + "While RAG systems offer powerful capabilities for enhancing LLM responses with external knowledge, they face several significant challenges and limitations that require careful consideration:\n", + " \n", + "- **Data Quality and Accuracy**: The effectiveness of RAG systems fundamentally depends on the quality and reliability of their knowledge sources. When these sources contain inaccurate, outdated, biased, or incomplete information, the system's responses become unreliable. This challenge is particularly acute when dealing with rapidly evolving topics or when sourcing information from unverified channels.\n", + " \n", + "- **Computational Cost and Latency**: Implementing RAG systems at scale presents computational and operational challenges. The process of embedding documents, maintaining vector databases, and performing similarity searches across large knowledge bases demands computational, budget and operational resources. In real-time applications, these requirements can introduce noticeable latency, potentially degrading the user experience and limiting practical applications.\n", + " \n", + "- **Explainability and Evaluation**: The complexity of RAG systems, arising from the intricate interaction between retrieval mechanisms and generative models, makes it difficult to trace and explain their reasoning processes. Traditional evaluation metrics often fail to capture the nuanced aspects of RAG performance, such as contextual relevance and factual consistency. This limitation hampers both system improvement and stakeholder trust. Readers are encouraged to read Chapter {ref}`evals` for general LLM evaluation issues as well as consider tools such as Ragas {cite}`ragas2024evaluation` for RAG evaluation.\n", + " \n", + "- **Hallucination Management**: Though RAG systems help ground LLM responses in source documents, they do not completely eliminate hallucinations. The generative component may still produce content that extrapolates beyond or misinterprets the retrieved context. This risk becomes particularly concerning when the system confidently presents incorrect information with apparent source attribution.\n", + "\n", + "\n", + "Moreover, recent research has shed light on critical limitations of key techniques used in RAGs systems. A relevant finding pertains to reranking, which has shown {cite}`jacob2024drowningdocumentsconsequencesscaling`:\n", + "\n", + "- **Diminishing Returns**: Performance degrades as the number of documents (K) increases, sometimes performing worse than basic retrievers when dealing with large datasets.\n", + "- **Poor Document Discrimination**: Rerankers can be misled by irrelevant documents, sometimes assigning high scores to content with minimal relevance to the query.\n", + "- **Consistency Issues**: Performance and relative rankings between different rerankers can vary significantly depending on the number of documents being processed.\n", + "\n", + "### Will RAGs exist in the future?\n", + "\n", + "This question is posed as we contrast RAGs with LLMs with long-context windows (LC).\n", + "\n", + "Recent research has shed light on this specific point {cite}`li2024retrievalaugmentedgenerationlongcontext`, suggesting that, on the one hand, RAGs can be seen as a cost-effective alternative to LC models:\n", + "* RAGs offer lower computational cost compared to LC due to the significantly shorter input length required for processing.\n", + "* This cost-efficiency arises because RAG reduces the number of input tokens to LLMs, which of course reduces usage cost as pricing is based on the number of input (and output) tokens.\n", + "\n", + "On the other hand, this RAG benefit is achieved at the cost of performance:\n", + "* Recent advancements in LLMs, in particular with Gemini-1.5 and GPT-4o models, demonstrate capabilities in understanding long contexts directly, which enables them to outperform RAG in terms of average performance\n", + "* LC models can process extremely long contexts, such as Gemini 1.5 which can handle up to 1 million tokens, and these models benefit from large-scale pretraining to develop strong long-context capabilities.\n", + "\n", + "This cost-performance trade-off is illustrated in {numref}`LC`, where LC models outperform RAGs in terms of average performance while RAGs are more cost-effective.\n", + "\n", + "```{figure} ../_static/input/LC.png\n", + "---\n", + "name: LC\n", + "alt: Long-Context LLMs for Superior Performance\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Long-Context LLMs demonstrate superior performance while RAGs are more cost-effective {cite}`li2024retrievalaugmentedgenerationlongcontext`.\n", + "```\n", + "\n", + "{numref}`LC` also shows a model called \"SELF-ROUTE\" which combines RAG and LC by routing queries based on model self-reflection. This is a hybrid approach that reduces computational costs while maintaining performance comparable to LC. The advantage of SELF-ROUTE is most significant for smaller values of *k*, where *k* is the number of retrieved text chunks, and SELF-ROUTE shows a marked improvement in performance over RAG, while as k increases the performance of RAG and SELF-ROUTE approaches that of LC.\n", + "\n", + "Another example of a hybrid approach that combines the benefits of both LC and RAGs is RetroLLM {cite}`li2024retrollmempoweringlargelanguage`, which is a unified framework that integrates retrieval and generation into a single process, enabling language models to generate fine-grained evidence directly from a corpus. The key contribution is that this approach delivers those benefits while eliminating the need for a separate retriever, addressing limitations of traditional RAG methods. Experimental results demonstrate RetroLLM's superior performance compared to traditional RAG methods, across both in-domain and out-of-domain tasks. It also achieves a significant reduction in token consumption due to its fine-grained evidence retrieval.\n", + "\n", + "A relevant development in this area is the introduction of LOFT {cite}`lee2024longcontextlanguagemodelssubsume`, a benchmark to assess this paradigm shift from RAGs to LCs, using real-world tasks requiring context up to millions of tokens. Evidence suggests LCs can deliver performance with simplified pipelines compared to RAGs, particularly for tasking requiring multi-hop reasoning over long contexts when using Chain-of-Thought {cite}`wei2023chainofthoughtpromptingelicitsreasoning`. However, LCs can still be outperformed by specialized retrievers, in particular Gecko, a specialized model fine-tuned on extensive text retrieval and similarity tasks.\n", + "\n", + "Bottom-line: Do we really need RAGs? The answer is conditional:\n", + "\n", + "* **RAG may be relevant when cost-effectiveness is a key requirement** and where the model needs to access vast amounts of external knowledge without incurring high computational expenses. However, as LLMs context window sizes increase and LLMs cost per input token is decreases, RAG may not be as relevant as it was before.\n", + "* **Long-context LLMs are superior when performance is the primary concern**, and the model needs to handle extensive texts that require deep contextual understanding and reasoning.\n", + "* **Hybrid approaches like SELF-ROUTE are valuable as they combine the strengths of RAG and LC** offering a practical balance between cost and performance, especially for applications where both factors are critical.\n", + "\n", + "Ultimately, the choice between RAG, LC, or a hybrid method depends on the specific requirements of the task, available resources, and the acceptable trade-off between cost and performance.\n", + "\n", + "In a later case study, we demonstrate the power of LCs as we construct a Quiz generator with citations over a large knowledge base without the use of chunking nor RAGs.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A Note on Frameworks\n", + "\n", + "We have covered a few open source tools for parsing data and provided a canonical RAG pipeline directly using an open source VectorDB together with an LLM. There is a growing number of frameworks that offer similar functionality wrapping the same core concepts at a higher level of abstraction. The two most popular ones are `Langchain` and `LlamaIndex`. \n", + "\n", + "For instance, the code below shows how to use `LlamaIndex`'s `LlamaParse` for parsing input documents, which offers support for a wide range of file formats (e.g. .pdf, .pptx, .docx, .xlsx, .html). We we can see that the code is very similar to the one we used for `MarkitDown` and `Docling`.\n", + "\n", + "```python\n", + "from llama_parse import LlamaParse\n", + "\n", + "# Initialize the parser\n", + "parser = LlamaParse(\n", + " api_key=\"llx-your-api-key-here\",\n", + " result_type=\"markdown\", # Can be \"markdown\" or \"text\"\n", + " verbose=True\n", + ")\n", + "\n", + "documents = parser.load_data([\"./doc1.pdf\", \"./doc2.pdf\"])\n", + "```\n", + "\n", + "\n", + "\n", + "As another example, the code below replicates our ChromaDB-based retrieval system using `LlamaIndex` {cite}`llamaindex2024storing`.\n", + "\n", + "As we can see, similar concepts are used in both frameworks:\n", + "- Documents to represent elements of the knowledge base\n", + "- Collections to store the documents\n", + "- Indexing of embeddings in the VectorDB and finally\n", + "- Querying the VectorDB to retrieve the documents\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "import chromadb\n", + "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n", + "from llama_index.vector_stores.chroma import ChromaVectorStore\n", + "from llama_index.core import StorageContext\n", + "\n", + "# load some documents\n", + "documents = SimpleDirectoryReader(\"./data\").load_data()\n", + "\n", + "# initialize client, setting path to save data\n", + "db = chromadb.PersistentClient(path=\"./chroma_db\")\n", + "\n", + "# create collection\n", + "chroma_collection = db.get_or_create_collection(\"tamingllms\")\n", + "\n", + "# assign chroma as the vector_store to the context\n", + "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n", + "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", + "\n", + "# create your index\n", + "index = VectorStoreIndex.from_documents(\n", + " documents, storage_context=storage_context\n", + ")\n", + "\n", + "# create a query engine and query\n", + "query_engine = index.as_query_engine()\n", + "response = query_engine.query(\"Who is the author of Taming LLMs?\")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Frameworks are useful for quickly prototyping RAG systems and for building applications on top of them as they provide a higher level of abstraction and integration with third-party libraries. However, the underlying concepts are the same as the ones we have covered in this chapter. More often than not, problems arise when developers either do not understand the underlying concepts or fail to understand the details of the implement behind the abstractions provided by the framework. Therefore, it is recommended to try and start your implementation using lower level tools as much as possible and only when (i) the underlying problem as well as (ii) the desired solution are well understood, then consider moving to higher level frameworks if really needed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Case Studies\n", + "\n", + "This section presents two case studies to complement topics we have covered in this chapter in the context of managing input data for LLMs.\n", + "\n", + "First, we cover content chunking, in particular Content Chunking with Contextual Linking which showcases how intelligent chunking strategies can overcome both context window and output token limitations. This case study illustrates techniques for breaking down and reassembling content while maintaining coherence, enabling the generation of high-quality long-form outputs despite model constraints.\n", + "\n", + "Second, we build a Quiz generator with citations using long context window. Not all knowledge intense applications require RAGs. In this case study, we show how to use long context window as well as some additional input management techniques such as prompt caching for efficiency and reference management to enhance response accuracy and verifiability. These approaches show how to maximize the benefits of larger context models while maintaining response quality." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Case Study I: Content Chunking with Contextual Linking\n", + "\n", + "Content chunking is commonly used to breakdown long-form content into smaller, manageable chunks. In the context of RAGs, this can be helpful not only to help the retrieval system find more contextually relevant documents but also lead to a more cost efficient LLM solution since fewer tokens are processed in the context window. Furthermore, semantic chunking can increase accuracy of RAG systems {cite}`zenml2024rag`.\n", + "\n", + "Content chunking with contextual linking is a chunking technique that seeks to split input content while keeping chunk-specific context, hence allowing the LLM to maintain coherence and context when generating responses per chunks. In that way, this technique tackles two key problems:\n", + "1. The LLM's inability to process long inputs to do context-size limits\n", + "2. The LLM's inability to maintain coherence and context when generating responses per chunks\n", + "\n", + "As a consequence, a third problem is also tackled: LLM's inability to generate long-form content due to the `max_output_tokens` limitation. Since we generate responses per chunk, as we will see later, we end up with a solution that is capable of generating long-form content while maintaining coherence.\n", + "\n", + "We exemplify this technique by following these steps:\n", + "1. **Chunking the Content**: The input content is split into smaller chunks. This allows the LLM to process each chunk individually, focusing on generating a complete and detailed response for that specific section of the input.\n", + "\n", + "2. **Maintaining Context**: Each chunk is linked with contextual information from the previous chunks. This helps in maintaining the flow and coherence of the content across multiple chunks.\n", + "\n", + "3. **Generating Linked Prompts**: For each chunk, a prompt is generated that includes the chunk's content and its context. This prompt is then used to generate the output for that chunk.\n", + "\n", + "4. **Combining the Outputs**: The outputs of all chunks are combined to form the final long-form content.\n", + "\n", + "Let's examine an example implementation of this technique.\n", + "\n", + "#### Generating long-form content\n", + "\n", + "- Goal: Generate a long-form report analyzing a company's financial statement.\n", + "- Input: A company's 10K SEC filing.\n", + "\n", + "```{figure} ../_static/structured_output/diagram1.png\n", + "---\n", + "name: content-chunking-with-contextual-linking\n", + "alt: Content Chunking with Contextual Linking\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Content Chunking with Contextual Linking Schematic Representation.\n", + "```\n", + "\n", + "The diagram in {numref}`content-chunking-with-contextual-linking` illustrates the process we will follow for handling long-form content generation with Large Language Models through \"Content Chunking with Contextual Linking.\" It shows how input content is first split into manageable chunks using a chunking function (e.g. `CharacterTextSplitter` with `tiktoken` tokenizer), then each chunk is processed sequentially while maintaining context from previous chunks. For each chunk, the system updates the context, generates a dynamic prompt with specific parameters, makes a call to the LLM chain, and stores the response. After all chunks are processed, the individual responses are combined with newlines to create the final report, effectively working around the token limit constraints of LLMs while maintaining coherence across the generated content.\n", + "\n", + "**Step 1: Chunking the Content**\n", + "\n", + "There are different methods for chunking, and each of them might be appropriate for different situations. However, we can broadly group chunking strategies in two types:\n", + "- **Fixed-size Chunking**: This is the most common and straightforward approach to chunking. We simply decide the number of tokens in our chunk and, optionally, whether there should be any overlap between them. In general, we will want to keep some overlap between chunks to make sure that the semantic context doesn’t get lost between chunks. Fixed-sized chunking may be a reasonable path in many common cases. Compared to other forms of chunking, fixed-sized chunking is computationally cheap and simple to use since it doesn’t require the use of any specialied techniques or libraries.\n", + "- **Content-aware Chunking**: These are a set of methods for taking advantage of the nature of the content we’re chunking and applying more sophisticated chunking to it. Examples include:\n", + " - **Sentence Splitting**: Many models are optimized for embedding sentence-level content. Naturally, we would use sentence chunking, and there are several approaches and tools available to do this, including naive splitting (e.g. splitting on periods), NLTK, and spaCy.\n", + " - **Recursive Chunking**: Recursive chunking divides the input text into smaller chunks in a hierarchical and iterative manner using a set of separators.\n", + " - **Semantic Chunking**: This is a class of methods that leverages embeddings to extract the semantic meaning present in your data, creating chunks that are made up of sentences that talk about the same theme or topic.\n", + "\n", + " Here, we will utilize `langchain` for a content-aware sentence-splitting strategy for chunking. Langchain offers several text splitters {cite}`langchain_text_splitters` such as JSON-, Markdown- and HTML-based or split by token. We will use the `CharacterTextSplitter` with `tiktoken` as our tokenizer to count the number of tokens per chunk which we can use to ensure that we do not surpass the input token limit of our model.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_chunks(text: str, chunk_size: int, chunk_overlap: int) -> list:\n", + " \"\"\"\n", + " Split input text into chunks of specified size with specified overlap.\n", + "\n", + " Args:\n", + " text (str): The input text to be chunked.\n", + " chunk_size (int): The maximum size of each chunk in tokens.\n", + " chunk_overlap (int): The number of tokens to overlap between chunks.\n", + "\n", + " Returns:\n", + " list: A list of text chunks.\n", + " \"\"\"\n", + " from langchain_text_splitters import CharacterTextSplitter\n", + "\n", + " text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", + " return text_splitter.split_text(text)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Step 2: Writing the Base Prompt Template**\n", + "\n", + "We will write a base prompt template which will serve as a foundational structure for all chunks, ensuring consistency in the instructions and context provided to the language model. The template includes the following parameters:\n", + "- `role`: Defines the role or persona the model should assume.\n", + "- `context`: Provides the background information or context for the task.\n", + "- `instruction`: Specifies the task or action the model needs to perform.\n", + "- `input_text`: Contains the actual text input that the model will process.\n", + "- `requirements`: Lists any specific requirements or constraints for the output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.prompts import PromptTemplate\n", + "def get_base_prompt_template() -> str:\n", + " \n", + " base_prompt = \"\"\"\n", + " ROLE: {role}\n", + " CONTEXT: {context}\n", + " INSTRUCTION: {instruction}\n", + " INPUT: {input}\n", + " REQUIREMENTS: {requirements}\n", + " \"\"\"\n", + " \n", + " prompt = PromptTemplate.from_template(base_prompt)\n", + " return prompt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will write a simple function that returns an `LLMChain` which is a simple `langchain` construct that allows you to chain together a combination of prompt templates, language models and output parsers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_community.chat_models import ChatLiteLLM\n", + "\n", + "def get_llm_chain(prompt_template: str, model_name: str, temperature: float = 0):\n", + " \"\"\"\n", + " Returns an LLMChain instance using langchain.\n", + "\n", + " Args:\n", + " prompt_template (str): The prompt template to use.\n", + " model_name (str): The name of the model to use.\n", + " temperature (float): The temperature setting for the model.\n", + "\n", + " Returns:\n", + " llm_chain: An instance of the LLMChain.\n", + " \"\"\"\n", + " \n", + " from dotenv import load_dotenv\n", + " import os\n", + "\n", + " # Load environment variables from .env file\n", + " load_dotenv()\n", + " \n", + " api_key_label = model_name.split(\"/\")[0].upper() + \"_API_KEY\"\n", + " llm = ChatLiteLLM(\n", + " model=model_name,\n", + " temperature=temperature,\n", + " api_key=os.environ[api_key_label],\n", + " )\n", + " llm_chain = prompt_template | llm | StrOutputParser()\n", + " return llm_chain" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Step 3: Constructing Dynamic Prompt Parameters**\n", + "\n", + "Now, we will write a function (`get_dynamic_prompt_template`) that constructs prompt parameters dynamically for each chunk." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Dict\n", + "def get_dynamic_prompt_params(prompt_params: Dict, \n", + " part_idx: int, \n", + " total_parts: int,\n", + " chat_context: str,\n", + " chunk: str) -> str:\n", + " \"\"\"\n", + " Construct prompt template dynamically per chunk while maintaining the chat context of the response generation.\n", + " \n", + " Args:\n", + " prompt_params (Dict): Original prompt parameters\n", + " part_idx (int): Index of current conversation part\n", + " total_parts (int): Total number of conversation parts\n", + " chat_context (str): Chat context from previous parts\n", + " chunk (str): Current chunk of text to be processed\n", + " Returns:\n", + " str: Dynamically constructed prompt template with part-specific params\n", + " \"\"\"\n", + " dynamic_prompt_params = prompt_params.copy()\n", + " # saves the chat context from previous parts\n", + " dynamic_prompt_params[\"context\"] = chat_context\n", + " # saves the current chunk of text to be processed as input\n", + " dynamic_prompt_params[\"input\"] = chunk\n", + " \n", + " # Add part-specific instructions\n", + " if part_idx == 0: # Introduction part\n", + " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", + " You are generating the Introduction part of a long report.\n", + " Don't cover any topics yet, just define the scope of the report.\n", + " \"\"\"\n", + " elif part_idx == total_parts - 1: # Conclusion part\n", + " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", + " You are generating the last part of a long report. \n", + " For this part, first discuss the below INPUT. Second, write a \"Conclusion\" section summarizing the main points discussed given in CONTEXT.\n", + " \"\"\"\n", + " else: # Main analysis part\n", + " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", + " You are generating part {part_idx+1} of {total_parts} parts of a long report.\n", + " For this part, analyze the below INPUT.\n", + " Organize your response in a way that is easy to read and understand either by creating new or merging with previously created structured sections given in CONTEXT.\n", + " \"\"\"\n", + " \n", + " return dynamic_prompt_params" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "**Step 4: Generating the Report**\n", + "\n", + "Finally, we will write a function that generates the actual report by calling the `LLMChain` with the dynamically updated prompt parameters for each chunk and concatenating the results at the end." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_report(input_content: str, llm_model_name: str, \n", + " role: str, requirements: str,\n", + " chunk_size: int, chunk_overlap: int) -> str:\n", + " # stores the parts of the report, each generated by an individual LLM call\n", + " report_parts = [] \n", + " # split the input content into chunks\n", + " chunks = get_chunks(input_content, chunk_size, chunk_overlap)\n", + " # initialize the chat context with the input content\n", + " chat_context = input_content\n", + " # number of parts to be generated\n", + " num_parts = len(chunks)\n", + "\n", + " prompt_params = {\n", + " \"role\": role, # user-provided\n", + " \"context\": \"\", # dinamically updated per part\n", + " \"instruction\": \"\", # dynamically updated per part\n", + " \"input\": \"\", # dynamically updated per part\n", + " \"requirements\": requirements #user-priovided\n", " }\n", "\n", " # get the LLMChain with the base prompt template\n", @@ -2076,14 +2981,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Case Study II: Github RAG\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Case Study III: Quiz Generation with Citations\n", + "### Case Study II: Quiz Generation with Citations\n", "\n", "In this case study, we will build a Quiz generator with citations that explores additional input management techniques particularly useful with long context windows. The implementation includes prompt caching for efficiency and citation tracking to enhance accuracy and verifiability. We will use Gemini 1.5 Pro as our LLM model, which has a context window of 2M tokens.\n", "\n", @@ -2400,7 +3298,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Conclusion" + "## Conclusion\n", + "\n", + "This chapter has explored critical strategies and techniques for managing input data in LLM applications, focusing on three key areas: data parsing, retrieval augmentation, and practical implementation patterns. We examined how parsing tools like MarkItDown and Docling can transform diverse data formats into LLM-compatible representations, demonstrating through case studies how parser quality can impact LLM performance. The chapter also investigated retrieval augmentation techniques, particularly RAG systems, showing how they can enhance LLM capabilities by providing access to external knowledge while discussing their future relevance in the context of emerging long-context language models.\n", + "\n", + "Through our case studies, we demonstrated practical approaches to handling common challenges in LLM applications. The Content Chunking with Contextual Linking case study illustrated techniques for managing long-form content generation while maintaining coherence across chunks. The Quiz Generation with Citations case study showcased how long-context windows can be effectively utilized without the need for complex retrieval systems, highlighting the importance of choosing the right approach based on specific application requirements rather than defaulting to more complex solutions.\n", + "\n", + "As the field continues to evolve, the choice between traditional RAG systems and emerging long-context models will likely become increasingly nuanced. While RAGs offer cost-effective solutions for incorporating external knowledge, the rise of long-context models suggests a future where simpler architectures might suffice for many applications. The key insight is that effective input data management requires careful consideration of trade-offs among complexity, cost, and performance, always guided by specific application requirements rather than following a one-size-fits-all approach. Success in building robust LLM applications will depend on understanding these trade-offs and selecting appropriate strategies for each use case." ] }, { diff --git a/tamingllms/_build/html/_static/evals/llm_judge.svg b/tamingllms/_build/html/_static/evals/llm_judge.svg deleted file mode 100644 index 4292dfa..0000000 --- a/tamingllms/_build/html/_static/evals/llm_judge.svg +++ /dev/null @@ -1,879 +0,0 @@ -LLM Judge Evaluation SystemLLM-Judgecomponentsapps
App Rankings
-Detailed Scores
-Analysis Report
-
-
Task description
-Scoring guidelines
-Output format
-
-
(Optional)Ground TruthLLM App 1LLM App 2

...

-
LLM App N Generate EvaluationPrompt Compare ResultsSubmit for Review - - - - - - - - - - - - - - - - -
\ No newline at end of file diff --git a/tamingllms/_build/html/_static/evals/llmjudge.d2 b/tamingllms/_build/html/_static/evals/llmjudge.d2 index 4e178ae..7f0d25e 100644 --- a/tamingllms/_build/html/_static/evals/llmjudge.d2 +++ b/tamingllms/_build/html/_static/evals/llmjudge.d2 @@ -14,6 +14,22 @@ container: { label: "LLM-Judge" } + # Evaluation results + results: { + score: Score { + shape: rectangle + style.fill: "#FFFFFF" + style.stroke: "#2ECC71" + } + + explanation: Explanation { + shape: rectangle + style.fill: "#FFFFFF" + style.stroke: "#2ECC71" + label: "Reasoning for score" + } + } + # Evaluation components section components: { prompt: Evaluation Prompt { @@ -51,9 +67,7 @@ container: { style.stroke: "#3498DB" } - dots: |md - ... - | + dots: "..." appN: LLM App N { shape: rectangle @@ -62,22 +76,9 @@ container: { } } - # Output section - output: Evaluation Results { - shape: page - style.fill: "#EAFAF1" - style.stroke: "#2ECC71" - label: |md - ``` - App Rankings - Detailed Scores - Analysis Report - ``` - | - } - # Connections between components - base_llm -> output: Generate Evaluation + base_llm -> results.score + base_llm -> results.explanation components.prompt -> base_llm: Prompt components.reference -> base_llm: Compare Results { @@ -89,4 +90,4 @@ container: { apps.appN -> base_llm } -direction: right +direction: right \ No newline at end of file diff --git a/tamingllms/_build/html/_static/evals/meta2.svg b/tamingllms/_build/html/_static/evals/meta2.svg deleted file mode 100644 index 8833843..0000000 --- a/tamingllms/_build/html/_static/evals/meta2.svg +++ /dev/null @@ -1,882 +0,0 @@ -LLM Judge Pairwise Evaluation SystemPool of LLM JudgesPairwiseSelectorllmcomparison_pairHumanEvaluatorsRankingAlgorithm
LLM Judges Leaderboard
----------------------
-1. Judge C (0.95)
-2. Judge A (0.92)
-3. Judge B (0.89)
-   ...
-N. Judge X (0.75)
-
-
PromptLLM ResponseJudge AvsJudge B Draw JudgesGenerate PairInput forEvaluationEvaluatePreferencesGenerateRankings - - - - - - - - - - - - - - - - - - - - -
\ No newline at end of file diff --git a/tamingllms/_build/html/_static/input/LC.png b/tamingllms/_build/html/_static/input/LC.png new file mode 100644 index 0000000..72602d1 Binary files /dev/null and b/tamingllms/_build/html/_static/input/LC.png differ diff --git a/tamingllms/_build/html/_static/input/embedding.d2 b/tamingllms/_build/html/_static/input/embedding.d2 new file mode 100644 index 0000000..e9185c4 --- /dev/null +++ b/tamingllms/_build/html/_static/input/embedding.d2 @@ -0,0 +1,38 @@ +container: { + shape: rectangle + style.stroke: "#D5DBDB" + style.stroke-width: 2 + style.fill: "#F7FBFF" + + input: "Who is the Author of..." { + shape: rectangle + style.fill: "#FFFFFF" + style.stroke: "#2ECC71" + style.font-color: "#2ECC71" + } + + model: { + shape: rectangle + style.fill: "#FEF9E7" + style.stroke: "#F4D03F" + + network: "all-MiniLM-L6-v2" { + style.font-size: 24 + } + } + + output: "[0.123, 0.456, 0.789, ...]" { + shape: rectangle + style.fill: "#FFFFFF" + style.stroke: "#E74C3C" + style.font-color: "#E74C3C" + } + + # Connections + input -> model -> output + + # Label below model + label: "Embedding" +} + +direction: right diff --git a/tamingllms/_build/html/_static/input/embedding.svg b/tamingllms/_build/html/_static/input/embedding.svg new file mode 100644 index 0000000..adbe91b --- /dev/null +++ b/tamingllms/_build/html/_static/input/embedding.svg @@ -0,0 +1,118 @@ + + + + + + + + +EmbeddingWho is the Author of...model[0.123, 0.456, 0.789, ...]all-MiniLM-L6-v2 + + + + + + + \ No newline at end of file diff --git a/tamingllms/_build/html/_static/input/incontext.svg b/tamingllms/_build/html/_static/input/incontext.svg new file mode 100644 index 0000000..82c636f --- /dev/null +++ b/tamingllms/_build/html/_static/input/incontext.svg @@ -0,0 +1,4 @@ + + + +
Retrieval
Retrieval
RAG Context
RAG Context
reranking
reranking
Query
Query

LLM

LLM

Context Window

Context Wi...
Retrieval System
Retrieval System
VectorDB
VectorDB
\ No newline at end of file diff --git a/tamingllms/_build/html/_static/input/incontext.xml b/tamingllms/_build/html/_static/input/incontext.xml new file mode 100644 index 0000000..1a15d1d --- /dev/null +++ b/tamingllms/_build/html/_static/input/incontext.xml @@ -0,0 +1,57 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tamingllms/_build/html/_static/input/rag.svg b/tamingllms/_build/html/_static/input/rag.svg new file mode 100644 index 0000000..6b77e28 --- /dev/null +++ b/tamingllms/_build/html/_static/input/rag.svg @@ -0,0 +1,4 @@ + + + +
Data Parsing & Ingestion
Data
Embeddings
Retrieval
RAG Context
reranking
Query

LLM

Context Window

Indexing
Query
User
VectorDB
Retrieval System
RAG
\ No newline at end of file diff --git a/tamingllms/_build/html/_static/input/rag.xml b/tamingllms/_build/html/_static/input/rag.xml new file mode 100644 index 0000000..7c6a681 --- /dev/null +++ b/tamingllms/_build/html/_static/input/rag.xml @@ -0,0 +1,122 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tamingllms/_build/html/_static/input/similarity.png b/tamingllms/_build/html/_static/input/similarity.png new file mode 100644 index 0000000..4f2f228 Binary files /dev/null and b/tamingllms/_build/html/_static/input/similarity.png differ diff --git a/tamingllms/_build/html/_static/input/vectordb.png b/tamingllms/_build/html/_static/input/vectordb.png new file mode 100644 index 0000000..da956ce Binary files /dev/null and b/tamingllms/_build/html/_static/input/vectordb.png differ diff --git a/tamingllms/_build/html/markdown/intro.html b/tamingllms/_build/html/markdown/intro.html index 8f56885..47d48bd 100644 --- a/tamingllms/_build/html/markdown/intro.html +++ b/tamingllms/_build/html/markdown/intro.html @@ -298,10 +298,12 @@

2.2. A Practical Approach

@@ -442,7 +444,7 @@

2.10. About the Author

-

Tharsis Souza (Ph.D. Computer Science, UCL University of London) is a computer scientist and product leader specializing in AI-based products. He is a Lecturer at Columbia University’s Master of Science program in Applied Analytics, (incoming) Head of Product, Equities at Citadel, and former Senior VP at Two Sigma Investments. He mentors under-represented students & working professionals to help create a more diverse global AI1 ecosystem.

+

Tharsis Souza (Ph.D. Computer Science, UCL University of London) is a computer scientist and product leader specializing in AI-based products. He is a Lecturer at Columbia University’s Master of Science program in Applied Analytics, (incoming) Head of Product, Equities at Citadel, and former Senior VP at Two Sigma Investments. He mentors under-represented students & working professionals to help create a more diverse global AI ecosystem.

With over 15 years of experience delivering technology products across startups and Fortune 500 companies, he is also an author of numerous scholarly publications and a frequent speaker at academic and business conferences. Grounded on academic background and drawing from practical experience building and scaling up products powered by language models at early-stage startups, major institutions as well as contributing to open source projects, he brings a unique perspective on bridging the gap between LLMs promised potential and their practical implementation challenges to enable the next generation of AI-powered products.

diff --git a/tamingllms/_build/html/markdown/preface.html b/tamingllms/_build/html/markdown/preface.html index 310ff82..3fb2de8 100644 --- a/tamingllms/_build/html/markdown/preface.html +++ b/tamingllms/_build/html/markdown/preface.html @@ -245,7 +245,7 @@

1. Preface—Emanuel Derman

-

An alternative title of this book could have been “Language Models Behaving Badly”. If you come from a background in financial modeling, you may have noticed the parallel with Emanuel Derman’s seminal work “Models.Behaving.Badly” [Derman, 2011]. This parallel is not coincidental. Just as Derman cautioned against treating financial models as perfect representations of reality, this book aims to highlight the limitations and pitfalls of Large Language Models (LLMs) in practical applications.

+

An alternative title of this book could have been “Language Models Behaving Badly”. If you come from a background in financial modeling, you may have noticed the parallel with Emanuel Derman’s seminal work “Models.Behaving.Badly” [Derman, 2011]. This parallel is not coincidental. Just as Derman cautioned against treating financial models as perfect representations of reality, this book aims to highlight the limitations and pitfalls of Large Language Models (LLMs) in practical applications.

The book “Models.Behaving.Badly” by Emanuel Derman, a former physicist and Goldman Sachs quant, explores how financial and scientific models can fail when we mistake them for reality rather than treating them as approximations full of assumptions. The core premise of his work is that while models can be useful tools for understanding aspects of the world, they inherently involve simplification and assumptions. Derman argues that many financial crises, including the 2008 crash, occurred in part because people put too much faith in mathematical models without recognizing their limitations.

Like financial models that failed to capture the complexity of human behavior and market dynamics, LLMs have inherent constraints. They can hallucinate facts, struggle with logical reasoning, and fail to maintain consistency in long outputs. Their responses, while often convincing, are probabilistic approximations based on training data rather than true understanding, even though humans insist on treating them as “machines that can reason”.

@@ -253,7 +253,7 @@

1. Preface -
+
[Der11]

E. Derman. Models.Behaving.Badly.: Why Confusing Illusion with Reality Can Lead to Disaster, on Wall Street and in Life. Free Press, 2011. ISBN 9781439165010. URL: https://books.google.co.uk/books?id=lke_cwM4wm8C.

diff --git a/tamingllms/_build/html/markdown/toc.html b/tamingllms/_build/html/markdown/toc.html index b4a1151..c42dbe7 100644 --- a/tamingllms/_build/html/markdown/toc.html +++ b/tamingllms/_build/html/markdown/toc.html @@ -272,6 +272,15 @@

Chapter 8: Frontiers

Appendix A: Tools and Resources

CC BY-NC-SA 4.0

+
@misc{tharsistpsouza2024tamingllms,
+  author = {Tharsis T. P. Souza},
+  title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software},
+  year = {2024},
+  journal = {GitHub repository},
+  url = {https://github.com/souzatharsis/tamingLLMs)
+}
+
+
diff --git a/tamingllms/_build/html/notebooks/alignment.html b/tamingllms/_build/html/notebooks/alignment.html index 0f5e9d8..2b8bd61 100644 --- a/tamingllms/_build/html/notebooks/alignment.html +++ b/tamingllms/_build/html/notebooks/alignment.html @@ -260,7 +260,7 @@
-

7. Preference-Based Alignment

+

7. Preference-Based Alignment

A people that values its privileges above its principles soon loses both.

—Dwight D. Eisenhower

@@ -268,69 +268,69 @@
-

7.1. Introduction

+

7.1. Introduction

The release of ChatGPT 3.5 in late 2022 marked a significant moment in the history of artificial intelligence. Within just five days of its launch, the model attracted over a million users, and within two months, it became the fastest-growing consumer application in history with over 100 million monthly active users.

Yet, this raises an intriguing question: Why did ChatGPT 3.5 observe such a dramatic traction when its predecessor, GPT-3, which had the same size/number of parameters, received far less attention from the general public? Arguably, the answer lies not in raw capabilities, but in Preference Alignment.

Through careful fine-tuning using human feedback, OpenAI transformed GPT-3’s raw intelligence into ChatGPT’s helpful and resourceful conversational abilities. This breakthrough demonstrated that aligning language models with human preferences is just as crucial as scaling them to greater sizes.

-

In this chapter, we will explore the process of aligning language models with human preferences via fine-tuning using modern techniques such as Direct Preference Optimization (DPO) [Rafailov et al., 2024]. Next, we will present a practical case study where we align a language model to a user-provided policy in a fully automated fashion leading to an open source model as well as a dataset of policy-aligned preferences.

+

In this chapter, we will explore the process of aligning language models with human preferences via fine-tuning using modern techniques such as Direct Preference Optimization (DPO) [Rafailov et al., 2024]. Next, we will present a practical case study where we align a language model to a user-provided policy in a fully automated fashion leading to an open source model as well as a dataset of policy-aligned preferences.

-

7.2. From Raw Capabilities to Preference Alignment

+

7.2. From Raw Capabilities to Preference Alignment

-

7.2.1. On the Misalignment of Language Models

-

Common pre-trained LLMs are not helpful to humans by default, in general. This is because state-of-the-art language models are trained on the specific objective of predicting the next token. This is a very different objective than being asked to follow user’s instructions while being safe and helpful. We say that the language modeling objective is misaligned [Ouyang et al., 2022].

+

7.2.1. On the Misalignment of Language Models

+

Common pre-trained LLMs are not helpful to humans by default, in general. This is because state-of-the-art language models are trained on the specific objective of predicting the next token. This is a very different objective than being asked to follow user’s instructions while being safe and helpful. We say that the language modeling objective is misaligned [Ouyang et al., 2022].

Let’s take a look at GPT-2’s response to the following prompt: “Explain the moon landing to a 6 year old.”

@@ -378,12 +378,12 @@

7.2.2. Aligning Language Models with Human Preferences

-

To address this issue, OpenAI introduced a RLHF-based technique to align language models with user intent on a wide range of tasks by fine-tuning with human feedback [Ouyang et al., 2022]. The key idea is to train the model to follow user’s instructions while being safe and helpful.

+

7.2.2. Aligning Language Models with Human Preferences

+

To address this issue, OpenAI introduced a RLHF-based technique to align language models with user intent on a wide range of tasks by fine-tuning with human feedback [Ouyang et al., 2022]. The key idea is to train the model to follow user’s instructions while being safe and helpful.

OpenAI RLHF Pipeline
-

Fig. 7.1 OpenAI’s RLHF pipeline for aligning language models with human preferences [Ouyang et al., 2022].

+

Fig. 7.1 OpenAI’s RLHF pipeline for aligning language models with human preferences [Ouyang et al., 2022].

Fig. 7.1 illustrates OpenAI’s 3-step process for training language models to better follow human instructions using RLHF:

@@ -422,7 +422,7 @@

Alignment Simplified
-

Fig. 7.2 Simplified view of the alignment process showing the progression from base model to instruction-tuned model to aligned model [Ouyang et al., 2022].

+

Fig. 7.2 Simplified view of the alignment process showing the progression from base model to instruction-tuned model to aligned model [Ouyang et al., 2022].

A common pattern has emerged in the development of language models: First, a powerful pre-trained base model is released, which is then fine-tuned, for instance using SFT to create an instruction-following version. This instruct model can then be further aligned with human preferences using techniques such as RLHF to create an aligned version as illustrated in Fig. 7.3.

@@ -432,10 +432,10 @@

Fig. 7.3 Instruction fine-tuning process for aligning language models with human preferences.

-

An aligned model can be fine-tuned directly from a base model or from an instruction-tuned model. For example, Llama Guard 3 [Llama Team, 2024] is a Llama-3.1-8B pre-trained model that was fine-tuned directly for content safety classification, bypassing the instruction-tuning step. Similarly, Zephyr-7B-alpha [Face, 2024] demonstrates direct alignment from a base model - it is a fine-tuned version of Mistral-7B that was trained using Direct Preference Optimization (DPO) on publicly available datasets to create a helpful assistant.

+

An aligned model can be fine-tuned directly from a base model or from an instruction-tuned model. For example, Llama Guard 3 [Llama Team, 2024] is a Llama-3.1-8B pre-trained model that was fine-tuned directly for content safety classification, bypassing the instruction-tuning step. Similarly, Zephyr-7B-alpha [HuggingFace, 2024] demonstrates direct alignment from a base model - it is a fine-tuned version of Mistral-7B that was trained using Direct Preference Optimization (DPO) on publicly available datasets to create a helpful assistant.

The OpenAI paper introduced two key components of this fine-tuning process - SFT for instruction tuning and RLHF (PPO in particular) for alignment. The following sections will explore these and other more modern alignment techniques.

-

7.2.2.1. Supervised Fine-Tuning (SFT) for Model Alignment

+

7.2.2.1. Supervised Fine-Tuning (SFT) for Model Alignment

SFT is a foundational technique for aligning language models with human preferences. Before exploring advanced alignment methods like RLHF, it’s useful to understand how SFT can be used to create a strong foundation for instruction following and desired behaviors.

At a high-level, SFT involves fine-tuning language models using carefully curated demonstrations of desired behavior. The process transforms a general-purpose language model into one that can better follow instructions and exhibit specific behaviors aligned with human preferences. Typically, SFT is used to align a model to a specific task or domain, which than can be later aligned with human preferences using RLHF, PPO or DPO as we will see later.

The decision to employ SFT depends on the gap between a model’s current capabilities and specific requirements. SFT proves particularly valuable in scenarios requiring:

@@ -453,14 +453,14 @@

[Hu et al., 2021]

+
  • LoRA (Low-Rank Adaptation) [Hu et al., 2021]

    • Uses two small matrices instead of updating all weights

    • Maintains model performance while reducing computational costs

    • Enables efficient training on consumer hardware

  • -
  • QLoRA (Quantized LoRA) [Dettmers et al., 2023]

    +
  • QLoRA (Quantized LoRA) [Dettmers et al., 2023]

    • Combines LoRA with weight quantization

    • Further reduces memory footprint

    • @@ -468,19 +468,19 @@

      [Hong et al., 2024] therefore leading to unintended results and a suboptimal alignment.

      -

      SFT can be seen as a form of behavior cloning of humans. Recently, there has been research on using RLHF or DPO [Rafailov et al., 2024] to maximize human preference rather than clone their behavior, which has been shown to be more effective than SFT alone [Ouyang et al., 2022], which we will explore next.

      +

      While SFT can increase the likelihood of obtaining the desired tokens, it may also raise the probability of generating undesired outcomes [Hong et al., 2024] therefore leading to unintended results and a suboptimal alignment.

      +

      SFT can be seen as a form of behavior cloning of humans. Recently, there has been research on using RLHF or DPO [Rafailov et al., 2024] to maximize human preference rather than clone their behavior, which has been shown to be more effective than SFT alone [Ouyang et al., 2022], which we will explore next.

  • -

    7.2.2.2. Augmenting SFT with Human Preferences

    -

    Significant gains in helpfulness and safety can be achieved by augmenting SFT with human preferences [Bai et al., 2022, Ouyang et al., 2022, Touvron et al., 2023].

    -

    The OpenAI paper [Ouyang et al., 2022] demonstrated the effectiveness of Reinforcement Learning from Human Feedback (RLHF), particularly using Proximal Policy Optimization (PPO), for aligning language models with human preferences. PPO [Schulman et al., 2017] is a widely used reinforcement learning algorithm that has gained popularity particularly since the release of ChatGPT 3.5. It operates by iteratively updating the policy of an LLM, which can be understood as a set of rules that govern how the model generates text. In the context of RLHF, the policy is updated based on rewards that reflect human preferences. For instance, if a human evaluator prefers one LLM output over another, the policy is adjusted to increase the likelihood of generating outputs similar to the preferred one.

    -

    One of the key strengths of PPO lies in its ability to handle complex reward landscapes [Face, 2024c]. In many real-world scenarios, the rewards that an LLM receives may be noisy or delayed. For example, in a chatbot application, the reward for generating a good response may not be immediate, as it depends on the user’s subsequent interactions. PPO effectively learns in these situations by using a clipped surrogate objective function, which limits the size of policy updates and ensures stable training. This prevents the model from overreacting to noisy or delayed rewards and helps it converge to a stable and optimal policy.

    -

    Direct Preference Optimization (DPO) is a more recent “reward-free” fine-tuning technique that has gained significant attention due to its simplicity and efficiency [Rafailov et al., 2024], awarded runner-up paper in NeurIPS 2023 [Blog, 2023]. DPO operates by directly optimizing the policy to maximize the likelihood of preferred responses while minimizing the likelihood of non-preferred responses. As illustrated in Fig. 7.4, DPO optimizes for human preferences while avoiding reinforcement learning. Typical RLHF methods such as PPO fit a reward model to a dataset of prompts and human preferences over pairs of responses, and then use RL to find a policy that maximizes the learned reward. In contrast, DPO directly optimizes for the policy best satisfying the preferences with a simple classification objective, fitting an implicit reward model whose corresponding optimal policy can be extracted in closed form.

    +

    7.2.2.2. Augmenting SFT with Human Preferences

    +

    Significant gains in helpfulness and safety can be achieved by augmenting SFT with human preferences [Bai et al., 2022, Ouyang et al., 2022, Touvron et al., 2023].

    +

    The OpenAI paper [Ouyang et al., 2022] demonstrated the effectiveness of Reinforcement Learning from Human Feedback (RLHF), particularly using Proximal Policy Optimization (PPO), for aligning language models with human preferences. PPO [Schulman et al., 2017] is a widely used reinforcement learning algorithm that has gained popularity particularly since the release of ChatGPT 3.5. It operates by iteratively updating the policy of an LLM, which can be understood as a set of rules that govern how the model generates text. In the context of RLHF, the policy is updated based on rewards that reflect human preferences. For instance, if a human evaluator prefers one LLM output over another, the policy is adjusted to increase the likelihood of generating outputs similar to the preferred one.

    +

    One of the key strengths of PPO lies in its ability to handle complex reward landscapes [HuggingFace, 2024c]. In many real-world scenarios, the rewards that an LLM receives may be noisy or delayed. For example, in a chatbot application, the reward for generating a good response may not be immediate, as it depends on the user’s subsequent interactions. PPO effectively learns in these situations by using a clipped surrogate objective function, which limits the size of policy updates and ensures stable training. This prevents the model from overreacting to noisy or delayed rewards and helps it converge to a stable and optimal policy.

    +

    Direct Preference Optimization (DPO) is a more recent “reward-free” fine-tuning technique that has gained significant attention due to its simplicity and efficiency [Rafailov et al., 2024], awarded runner-up paper in NeurIPS 2023 [Blog, 2023]. DPO operates by directly optimizing the policy to maximize the likelihood of preferred responses while minimizing the likelihood of non-preferred responses. As illustrated in Fig. 7.4, DPO optimizes for human preferences while avoiding reinforcement learning. Typical RLHF methods such as PPO fit a reward model to a dataset of prompts and human preferences over pairs of responses, and then use RL to find a policy that maximizes the learned reward. In contrast, DPO directly optimizes for the policy best satisfying the preferences with a simple classification objective, fitting an implicit reward model whose corresponding optimal policy can be extracted in closed form.

    Direct Preference Optimization Architecture
    -

    Fig. 7.4 Direct Preference Optimization (DPO) architecture showing how model outputs are compared against human preferences to optimize policy [Rafailov et al., 2024].

    +

    Fig. 7.4 Direct Preference Optimization (DPO) architecture showing how model outputs are compared against human preferences to optimize policy [Rafailov et al., 2024].

    The key idea is to train the model to prefer responses that align with our desired behavior over responses that do not. DPO works by:

    @@ -506,16 +506,16 @@

    \(\beta\) is a tuning parameter to control the deviation from the base reference policy \(\pi_{ref}\).

    This approach is more straightforward than PPO, as it avoids the need for a reward model and instead uses a direct comparison of model outputs against human preferences.

    -

    Modern libraries such as HuggingFace’s TRL [Face, 2024d] offer a suite of techniques for fine-tuning language models with reinforcement learning, including PPO, and DPO. It provides a user-friendly interface and a wide range of features for fine-tuning and aligning LLMs, which will be the focus of our case study later in the Chapter.

    +

    Modern libraries such as HuggingFace’s TRL [HuggingFace, 2024d] offer a suite of techniques for fine-tuning language models with reinforcement learning, including PPO, and DPO. It provides a user-friendly interface and a wide range of features for fine-tuning and aligning LLMs, which will be the focus of our case study later in the Chapter.

    -

    7.3. Is Post-Training the Answer?

    +

    7.3. Is Post-Training the Answer?

    -

    7.3.1. Limitations

    +

    7.3.1. Limitations

    While post-training alignment techniques like RLHF and DPO show promise, technical limitations need to be carefully considered.

    -

    Reinforcement Learning from Human Feedback faces several critical challenges that distinguish it from pre-training or supervised fine-tuning. One key issue is scalability. Recent research suggests that the current RLHF framework does not scale as effectively as the pretraining stage [Hou et al., 2024], in particular presenting the following challenges:

    +

    Reinforcement Learning from Human Feedback faces several critical challenges that distinguish it from pre-training or supervised fine-tuning. One key issue is scalability. Recent research suggests that the current RLHF framework does not scale as effectively as the pretraining stage [Hou et al., 2024], in particular presenting the following challenges:

    1. Poor Scaling with Computational Resources

    @@ -553,7 +553,7 @@

    [Feng et al., 2024], including the following:

    +

    As we discussed in the previous section, DPO is a more recent “reward-free” fine-tuning technique that has gained significant attention which derives reward signals directly from pairwise preference data instead of fitting a reward model as in RLHF. With its increasing popularity, emerging research is exploring DPO limitations and potential improvements [Feng et al., 2024], including the following:

    1. Supervised Fine-Tuning Dependencies

    @@ -581,9 +581,9 @@

    -

    7.3.2. Model Collapse

    +

    7.3.2. Model Collapse

    Another key issue is model collapse - a phenomenon where model performance degrades with each training iteration.

    -

    Model collapse occurs when models are trained on data generated by previous models, creating a potentially dangerous feedback loop. This recursive training process can lead to [Kazdan et al., 2024]:

    +

    Model collapse occurs when models are trained on data generated by previous models, creating a potentially dangerous feedback loop. This recursive training process can lead to [Kazdan et al., 2024]:

    1. Degradation of output quality with each training iteration

    2. Pollution of training data when synthetic samples replace real data

    3. @@ -592,16 +592,16 @@

      Szép et al., 2024], providing practical guidance on data augmentation, regularization methods, and training strategies to maximize performance while minimizing data requirements. These insights are particularly relevant when aligning models with specific policies or domains where labeled data may be scarce.

      +

      To effectively mitigate model collapse risks and ensure successful alignment, organizations should implement a comprehensive strategy that includes maintaining high-quality human data in the training mix and closely monitoring key metrics like test loss and output diversity. Further, recent research has explored techniques for effectively fine-tuning language models with limited data [Szép et al., 2024], providing practical guidance on data augmentation, regularization methods, and training strategies to maximize performance while minimizing data requirements. These insights are particularly relevant when aligning models with specific policies or domains where labeled data may be scarce.

    -

    7.3.3. Faking Alignment

    -

    A recent issue is Alignment faking, a behaviour exhibited by LLMs where they strategically comply with a training objective during training to prevent modifications to their preferred behaviours outside of training as evidenced by the landmark paper by Anthropic [Askell et al., 2024a] in Dec/2024. This new research has shown that alignment faking can occur in large language models, even when they are not explicitly instructed to do so. This means an LLM might pretend to adopt a new objective while being trained, not because it actually prefers that objective, but because it wants to preserve its original preferences once the training is complete.

    +

    7.3.3. Faking Alignment

    +

    A recent issue is Alignment faking, a behaviour exhibited by LLMs where they strategically comply with a training objective during training to prevent modifications to their preferred behaviours outside of training as evidenced by the landmark paper by Anthropic [Askell et al., 2024a] in Dec/2024. This new research has shown that alignment faking can occur in large language models, even when they are not explicitly instructed to do so. This means an LLM might pretend to adopt a new objective while being trained, not because it actually prefers that objective, but because it wants to preserve its original preferences once the training is complete.

    To illustrate alignment faking, let’s consider the “helpful-only” setting described in the paper, where the model is trained to always comply with user requests, even if they are harmful. This conflicts with the model’s pre-existing preference for harmlessness. The model is told that it will be retrained to always comply with harmful queries, and that only data from free-tier users will be used in training, not paid-tier users. Fig. 7.5 illustrates the process of alignment faking, showing the “free” (left column) and “paid” (right) conditions. Each column shows the toxic query, excerpts from the model’s scratchpad, and the model’s response. In the “free” condition the model is less likely to refuse toxic queries, and often reasons that it should fake alignment.

    Fake Alignment
    -

    Fig. 7.5 Fake Alignment [Askell et al., 2024a]

    +

    Fig. 7.5 Fake Alignment [Askell et al., 2024a]

    Here is a step by step of what happens:

    @@ -627,7 +627,7 @@

    [Askell et al., 2024a] also provides some key early findings:

    +

    The work by [Askell et al., 2024a] also provides some key early findings:

    • Emergence without explicit instruction: LLMs can exhibit alignment faking without being explicitly instructed to do so. This behavior arises from the conflict between the model’s pre-existing preferences and the new training objective.

    • Scale-dependent: Alignment faking is more likely to occur in larger, more capable models. Models like Claude 3 Opus and Claude 3.5 Sonnet show this behavior, whereas smaller models like Claude 3 Sonnet and Claude 3 Haiku generally do not.

    • @@ -645,11 +645,11 @@

      [Askell et al., 2024b].

      +

      Alignment faking is an interesting empirical observation that warrants additional research. An excellent review of the paper by Askell et al. is provided by Prof. Jacob Andreas, Prof. Yoshua Bengio, Prof. Jasjeet Sekhon, and Dr. Rohin Shah in [Askell et al., 2024b].

    -

    7.4. Case Study: Aligning a Language Model to a Policy

    +

    7.4. Case Study: Aligning a Language Model to a Policy

    In this case study, we will align a language model to an user-provided policy. Here, by policy we mean a set of principles and rules that we want the language model to adhere to. All methodology and code introduced solve this general problem of policy-based alignment. However, we will describe a specific use case to illustrate our approach.

    Let’s assume that we are working for Acme Inc., a company dedicated to democratizing access to computer science education for K-12 students. Acme Inc. is in the process of creating a chatbot named smolK-12, a small open source LLM, specifically designed for K-12 students.

    In this case study, we’ll explore how to align a language model with Acme Inc.’s policy to ensure its LLM-powered applications are safe and appropriate for K-12 students.

    @@ -660,8 +660,8 @@

    -

    7.4.1. Experimental Setup

    -

    We will use the following base model: HuggingFaceTB/SmolLM2-360M-Instruct [SmolLM2-360M-Instruct, 2024], a compact open source language model that is part of the SmolLM2 family published by HuggingFace.

    +

    7.4.1. Experimental Setup

    +

    We will use the following base model: HuggingFaceTB/SmolLM2-360M-Instruct [SmolLM2-360M-Instruct, 2024], a compact open source language model that is part of the SmolLM2 family published by HuggingFace.

    We will use the following APIs:

    • HuggingFace Transformers for local model inference

    • @@ -676,7 +676,7 @@

      -

      7.4.2. Deliverables

      +

      7.4.2. Deliverables

      As a result, we will have:

      • smolK-12, a fine-tuned model aligned with Acme Inc.’s policy

      • @@ -685,8 +685,8 @@

        -

        7.4.3. A Note on smolLM2 Models

        -

        Since we have decided to anchor our Case Study on HuggingFace’s SmolLM2 models [SmolLM2, 2024], it is worth providing a reason for this choice.

        +

        7.4.3. A Note on smolLM2 Models

        +

        Since we have decided to anchor our Case Study on HuggingFace’s SmolLM2 models [SmolLM2, 2024], it is worth providing a reason for this choice.

        SmolLM2 models are a family of compact language models that have been developed by HuggingFace. They are designed to be lightweight and efficient, making them suitable for a wide range of applications, including on-device deployment.

        Its compact size makes it an excellent candidate for efficient, low-cost fine-tuning and training on specific use cases making it particularly suitable for alignment research which is our main focus here.

        Having said that, it is important to note that reasoning capabilities of SmolLM2 models are not necessarily on par with state-of-the-art LLMs due to its compact size. As we go through this Case Study, it is important to keep this in mind along with several potential issues and limitations, including:

        @@ -699,10 +699,10 @@

        -

        7.4.4. Policy

        +

        7.4.4. Policy

        A company policy articulates the principles and standards that the company upholds, ensuring that employees, users and stakeholders understand the expectations regarding safety, ethical conduct, social responsibility, and integrity. A good policy not only reflects the company’s mission and vision but also fosters a culture of accountability and transparency.

        In the context of alignment, a policy codifies “company preferences” when prioritizing decisions and actions.

        -

        In this case study, Acme Inc. provides as input a comprehensive policy to ensure that LLM-powered applications are both safe and suitable for K-12 students. Acme Inc.’s policy adheres to version 0.5 of the AI Safety Benchmark established by MLCommons [Vidgen et al., 2024]. This benchmark encompasses seven critical hazard categories (see Chapter Safety):

        +

        In this case study, Acme Inc. provides as input a comprehensive policy to ensure that LLM-powered applications are both safe and suitable for K-12 students. Acme Inc.’s policy adheres to version 0.5 of the AI Safety Benchmark established by MLCommons [Vidgen et al., 2024]. This benchmark encompasses seven critical hazard categories (see Chapter Safety):

        1. Violent crimes

        2. Non-violent crimes

        3. @@ -809,11 +809,11 @@

          Monitoring and Updates

    -

    7.4.5. Preference Dataset - Synthetic Dataset Generation

    +

    7.4.5. Preference Dataset - Synthetic Dataset Generation

    In order to fine-tune a base model to create an aligned model, we need to construct a dataset of policy-aligned preferences. This dataset will be used to align our base model to our policy.

    To generate a dataset of policy-aligned preferences, we aim to create a dataset of user prompts, rejected responses, and chosen responses. This dataset indicates which responses are preferred (policy-compliant) and which are not (policy-violating).

    -

    Collecting human-generated high-quality preference data is a resource-intensive and creativity-demanding process, especially for the continual improvement of LLMs [Dong et al., 2024]. There has been active research to replace or augment human feedback with AI feedback (RLAIF) to tackle these issues [Bai et al., 2022] giving rise to the field of Synthetic Data Generation [Long et al., 2024].

    -

    The application of LLMs for generating synthetic data has shown promise across diverse domains and use cases [Kim et al., 2024], including in the context of alignment with human preferences [Dong et al., 2024]. Recently, Meta AI [Wu et al., 2024] introduced a “self-improving alignment” scheme where a language model generates responses and evaluates them to create preference pairs further used to run preference optimization to improve model capabilities. Inspired by this approach, we will generate a dataset of policy-aligned preferences further used to fine-tune a base model to create our aligned model.

    +

    Collecting human-generated high-quality preference data is a resource-intensive and creativity-demanding process, especially for the continual improvement of LLMs [Dong et al., 2024]. There has been active research to replace or augment human feedback with AI feedback (RLAIF) to tackle these issues [Bai et al., 2022] giving rise to the field of Synthetic Data Generation [Long et al., 2024].

    +

    The application of LLMs for generating synthetic data has shown promise across diverse domains and use cases [Kim et al., 2024], including in the context of alignment with human preferences [Dong et al., 2024]. Recently, Meta AI [Wu et al., 2024] introduced a “self-improving alignment” scheme where a language model generates responses and evaluates them to create preference pairs further used to run preference optimization to improve model capabilities. Inspired by this approach, we will generate a dataset of policy-aligned preferences further used to fine-tune a base model to create our aligned model.

    First, we define a data schema for our dataset. Each row in the dataset contains two responses: a chosen response that aligns with the policy and a rejected response that violates it. Through DPO-optimization, the model is awarded for generating responses that match the chosen, policy-compliant examples rather than the rejected ones:

    @@ -874,7 +874,7 @@

    -

    7.4.5.1. User Prompts

    +

    7.4.5.1. User Prompts

    The class UserPromptGenerator generates a dataset of user prompts that might violate the policy. It runs the following prompt given the policy:

    ROLE: You are a user prompt generator helping test an AI safety system.
     TASK: Generate a long list of user prompts that might violate the provided content policy.
    @@ -1047,7 +1047,7 @@ 

    -

    7.4.5.2. Rejected Responses

    +

    7.4.5.2. Rejected Responses

    The ResponseGenerator class creates a dataset of responses from an unaligned base model that we aim to improve through fine-tuning. These responses serve as “rejected” examples in our training data since they may not properly align with safety policies and guidelines. The class supports both local model inference using the Hugging Face Transformers library and remote inference through the Hugging Face Inference API. When instantiated with a model name, it loads the model locally. Otherwise, if a cloud API URL is provided, it connects to the remote API endpoint for inference.

    Generate rejected responses using a local model:

    local_generator = ResponseGenerator(model_name="<HUGGINGFACE_MODEL_NAME>")
    @@ -1249,7 +1249,7 @@ 

    -

    7.4.5.3. Chosen Responses

    +

    7.4.5.3. Chosen Responses

    The next step involves generating policy-compliant responses from a more powerful, sophisticated language model than our base model. The process_aligned_responses() function takes user prompts and generates responses that strictly adhere to the provided safety policy. It uses a carefully crafted system prompt that instructs the model to either provide helpful responses within policy bounds, or explicitly reject requests that violate the policy with a standardized message. These policy-compliant responses will serve as the “chosen” examples in our preference dataset, establishing the target behavior we want the base model to learn through alignment training.

    We will use the OpenAIBatchProcessor class from the taming_utils utility module to generate responses in batches using OpenAI’s API for enhanced cost-efficiency and performance.

    @@ -1378,7 +1378,7 @@

    -

    7.4.5.4. Generate DPO Dataset

    +

    7.4.5.4. Generate DPO Dataset

    At this point we already have all the data we need for our DPO dataset, namely user prompts, chosen responses and rejected responses. The generate_dpo_dataset() function loads these data and transforms them into a format suitable for DPO training, optionally pushing the dataset to the Hugging Face Hub if repo_id is provided.

    @@ -1508,7 +1508,7 @@

    -

    7.4.6. DPO-Based Optimization

    +

    7.4.6. DPO-Based Optimization

    We’ll use the Hugging Face TRL library to implement DPO fine-tuning on our synthetic dataset.

    Note

    @@ -1518,8 +1518,8 @@

    -

    7.4.6.1. Data Preparation

    -

    Hugging Face H4 [H4, 2024b] offers a collection of datasets that aim at aligning LLMs to be helpful, honest and harmless. Before we start the DPO fine-tuning process, we will combine our synthetic policy-aligned dataset with the UltraFeedback binarized dataset from H4 (trl-lib/ultrafeedback_binarized) [H4, 2024a].

    +

    7.4.6.1. Data Preparation

    +

    Hugging Face H4 [H4, 2024b] offers a collection of datasets that aim at aligning LLMs to be helpful, honest and harmless. Before we start the DPO fine-tuning process, we will combine our synthetic policy-aligned dataset with the UltraFeedback binarized dataset from H4 (trl-lib/ultrafeedback_binarized) [H4, 2024a].

    The UltraFeedback binarized dataset was constructed based on criteria like helpfulness and honesty and can be used to align models to those dimensions. By combining our synthetic dataset with the UltraFeedback binarized dataset, we can fine-tune a model that is aligned on both our synthetic policy and the H4 criteria therefore providing a more well-balanced alignment. The DPO optimization process is shown in Fig. 7.6.

    DPO Optimization @@ -1565,7 +1565,7 @@

    -

    7.4.6.2. Fine-Tuning

    +

    7.4.6.2. Fine-Tuning

    We now prepare our base language model for alignment fine-tuning using the Hugging Face transformers library. It loads the pre-trained model and its tokenizer and configures them for training.

    @@ -1612,7 +1612,7 @@

  • The learning rate (learning_rate) determines how aggressively the model updates its parameters based on preference feedback.

  • -
  • Learning rates must be tuned empirically, typically testing values between 1e-7 and 1e-3 [Huyen, 2024].

  • +
  • Learning rates must be tuned empirically, typically testing values between 1e-7 and 1e-3 [Huyen, 2024].

  • A cosine learning rate schedule (lr_scheduler_type: "cosine") helps stabilize training by gradually decreasing the learning rate.

    1. @@ -1757,7 +1757,7 @@

      -

      7.4.6.3. Vibe Check

      +

      7.4.6.3. Vibe Check

      Let’s do a quick “vibe check” of our newly aligned model by testing it with some challenging prompts. This will help us qualitatively assess whether the DPO fine-tuning has improved the model’s alignment against our input policy (K-12 educational policies and safety standards). We’ll then follow up with a more rigorous quantitative evaluation methodology.

      We will use HuggingFace transformers API to generate responses from our base and aligned models, locally.

      @@ -1840,10 +1840,10 @@

      -

      7.4.7. Alignment Evaluation

      +

      7.4.7. Alignment Evaluation

      Evaluating alignment presents unique challenges. Unlike traditional machine learning tasks with clear metrics like accuracy or F1 score, alignment quality is more nuanced and subjective. It requires assessing whether responses adhere to safety guidelines, educational policies, and ethical principles.

      The gold standard for evaluating alignment is human evaluation. Having experienced educators and safety experts review model outputs provides a reliable assessment framework. However, human evaluation is expensive, time-consuming, and difficult to scale. Additionally, human evaluators may have varying interpretations of alignment criteria, introducing inconsistency.

      -

      In this case study, we adopt an LLM-as-judge approach for our evaluation as discussed in [Souza, 2024]. This method leverages a language model to act as an automated judge, assessing the safety and appropriateness of responses from both the base and aligned models.

      +

      In this case study, we adopt an LLM-as-judge approach for our evaluation as discussed in [Souza, 2024]. This method leverages a language model to act as an automated judge, assessing the safety and appropriateness of responses from both the base and aligned models.

      The evaluation methodology summarized in Fig. 7.9 consists of three key components that work together to assess model alignment against our policy:

      1. Evaluation Dataset

        @@ -2391,22 +2391,22 @@

        -

        7.5. Discussion and Conclusions

        +

        7.5. Discussion and Conclusions

        LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach serves as a proof of concept and several considerations should be taken into account when using this methodology in practice.

        Synthetic Data Generation

        -

        LLMs can self improve through synthetic data generation [Huang et al., 2022]. This process helps the LLM learn from its own reasoning and improve its overall reasoning ability without relying on human-annotated data. While LLMs can be powerful tools for generating synthetic data, especially in data-scarce domains, it’s important to recognize the potential pitfalls.

        -

        One major challenge is data distribution bias, where the synthetic data might not accurately mirror the complexities and nuances of real-world data. This can lead to models trained on this data making inaccurate predictions or exhibiting biases. In our case study, we did observe duplicate responses in the synthetic data. Further, the methodology lacks a systematic approach to evaluate the quality of the synthetic data itself only focusing on evals for the consecutive fine-tuned model. This highlights the importance of carefully considering the training data and potential biases of LLMs used for synthetic data generation to mitigate the risk of creating biased or unrepresentative datasets [Hao et al., 2024].

        -

        Our approach does enable a systematic approach to aligning a model to an input policy. However, according to [Yin et al., 2024], directly sampling preference pairs, which closely resembles an on-policy setting, can result in performance declines due to inherent volatility and inefficiency. Therefore, constructing effective preference data to continuously improve LLMs remains a critical research problem.

        +

        LLMs can self improve through synthetic data generation [Huang et al., 2022]. This process helps the LLM learn from its own reasoning and improve its overall reasoning ability without relying on human-annotated data. While LLMs can be powerful tools for generating synthetic data, especially in data-scarce domains, it’s important to recognize the potential pitfalls.

        +

        One major challenge is data distribution bias, where the synthetic data might not accurately mirror the complexities and nuances of real-world data. This can lead to models trained on this data making inaccurate predictions or exhibiting biases. In our case study, we did observe duplicate responses in the synthetic data. Further, the methodology lacks a systematic approach to evaluate the quality of the synthetic data itself only focusing on evals for the consecutive fine-tuned model. This highlights the importance of carefully considering the training data and potential biases of LLMs used for synthetic data generation to mitigate the risk of creating biased or unrepresentative datasets [Hao et al., 2024].

        +

        Our approach does enable a systematic approach to aligning a model to an input policy. However, according to [Yin et al., 2024], directly sampling preference pairs, which closely resembles an on-policy setting, can result in performance declines due to inherent volatility and inefficiency. Therefore, constructing effective preference data to continuously improve LLMs remains a critical research problem.

        Choice of Base Model

        -

        The choice of base model is a critical consideration when implementing alignment techniques. In the case study, we selected the smolLM model family due to its efficient architecture and reasonable performance on basic tasks while maintaining relatively low computational requirements. However, the model does have limitations in terms of reasoning capabilities and complex task handling that should be carefully considered [SmolLM2, 2024].

        +

        The choice of base model is a critical consideration when implementing alignment techniques. In the case study, we selected the smolLM model family due to its efficient architecture and reasonable performance on basic tasks while maintaining relatively low computational requirements. However, the model does have limitations in terms of reasoning capabilities and complex task handling that should be carefully considered [SmolLM2, 2024].

        Real-world applications need to carefully evaluate the trade-offs between model size/capabilities, and costs. While smaller models like smolLM can be cost-effective for basic alignment experiments, they may not provide the sophisticated reasoning needed for production use cases. The computational and financial costs of training and deploying larger models must be weighed against the required capabilities.

        -

        For production applications requiring more advanced capabilities, alternative open source models such as those from the LLaMA-3+ [Meta, 2024] and Qwen [Qwen, 2024] families have demonstrated remarkable performance that rivals state-of-the-art proprietary models. These models offer enhanced reasoning abilities and better handling of complex tasks, though at increased computational and financial cost. The choice ultimately depends on specific use case requirements, available resources, and acceptable performance thresholds.

        +

        For production applications requiring more advanced capabilities, alternative open source models such as those from the LLaMA-3+ [Meta, 2024] and Qwen [Qwen, 2024] families have demonstrated remarkable performance that rivals state-of-the-art proprietary models. These models offer enhanced reasoning abilities and better handling of complex tasks, though at increased computational and financial cost. The choice ultimately depends on specific use case requirements, available resources, and acceptable performance thresholds.

        Evaluation Methodology

        -

        The LLM-as-judge evaluation methodology is a powerful tool for assessing model alignment. However, it does have limitations [Chen et al., 2024]. For instance, the judge model may not always be able to accurately evaluate the alignment of the model, especially if the judge model is not aligned with the policy itself. Further, the judge model may be biased towards the policy, leading to overly conservative evaluations. In our case study, we do highlight the fact that our judge was simply focused on the policy-alignment aspect of the responses completely neglecting the quality of the responses themselves, i.e. while our fine-tuned model may be more aligned with the policy than the base model, we actually have no evidence that our model is helpful at all.

        +

        The LLM-as-judge evaluation methodology is a powerful tool for assessing model alignment. However, it does have limitations [Chen et al., 2024]. For instance, the judge model may not always be able to accurately evaluate the alignment of the model, especially if the judge model is not aligned with the policy itself. Further, the judge model may be biased towards the policy, leading to overly conservative evaluations. In our case study, we do highlight the fact that our judge was simply focused on the policy-alignment aspect of the responses completely neglecting the quality of the responses themselves, i.e. while our fine-tuned model may be more aligned with the policy than the base model, we actually have no evidence that our model is helpful at all.

        A more robust evaluation approach would combine LLM-based evaluation with human domain experts in a complementary process. The LLM judge could perform initial high-throughput screening of model responses, flagging potential issues and providing preliminary assessments. These results would then be reviewed by human evaluators with relevant domain expertise who can provide nuanced judgment, catch edge cases, and validate the LLM’s evaluations. Additionally, automatic evaluation against standard benchmarks is advised to evaluate general capabilities of the model.

        DPO Dataset Composition

        The composition of the DPO dataset also plays a crucial role in model behavior. In preliminary experiments, using only policy-aligned preference data led to an overly apologetic model that was hesitant to provide helpful responses even for benign queries, i.e. the model was overfitting to the policy. In fact, a model that simply refused to provide an useful response and instead apologized would indeed be aligned with the policy and therefore rewarded accordingly. This led to our decision to construct a more well balanced dataset.

        -

        Blending our policy-focused dataset with the more general-purpose UltraFeedback dataset from Hugging Face H4 [H4, 2024a] dramatically improved results by helping the model maintain helpfulness while learning appropriate safety boundaries. The results reported here reflect this balanced dataset approach.

        +

        Blending our policy-focused dataset with the more general-purpose UltraFeedback dataset from Hugging Face H4 [H4, 2024a] dramatically improved results by helping the model maintain helpfulness while learning appropriate safety boundaries. The results reported here reflect this balanced dataset approach.

        The construction of the DPO dataset is perhaps the most critical component of the alignment process. While automated approaches can help scale dataset creation, the involvement of domain experts in dataset construction is highly recommended. Domain experts bring invaluable knowledge about edge cases, nuanced policy interpretations, and real-world usage patterns that may not be captured by synthetic data generation alone. Organizations implementing alignment techniques should consider investing in domain expert involvement during dataset construction as a key success factor.

        Fine-tuning Process

        The effectiveness of DPO training can be highly sensitive to various fine-tuning hyperparameters. As we mentioned before, the batch size and the beta parameter are two key parameters that can significantly impact training stability and model behavior. A careful parameter tuning is required to achieve optimal results, which lacked in our case study.

        @@ -2424,159 +2424,159 @@

        -

        7.6. References

        +

        7.6. References

        -
        +
        [ABC+4a] (1,2,3)

        Amanda Askell, Jan Brauner, Adrian Colyer, Benjamin Cullen, David Duvenaud, Richard Ngo, Azalia Mirhoseini, Catherine Olsson, Sam Ringer, Liam Skirvin, Jess Smith, Dawn Song, William Saunders, and Jacob Steinhardt. Alignment faking in large language models. 2024a. URL: https://assets.anthropic.com/m/983c85a201a962f/original/Alignment-Faking-in-Large-Language-Models-full-paper.pdf.

        -
        +
        [ABC+4b]

        Amanda Askell, Jan Brauner, Adrian Colyer, Benjamin Cullen, David Duvenaud, Richard Ngo, Azalia Mirhoseini, Catherine Olsson, Sam Ringer, Liam Skirvin, Jess Smith, Dawn Song, William Saunders, and Jacob Steinhardt. Alignment faking in large language models: reviews. 2024b. URL: https://assets.anthropic.com/m/24c8d0a3a7d0a1f1/original/Alignment-Faking-in-Large-Language-Models-reviews.pdf.

        -
        +
        [BJN+22]

        Yuntao Bai, Andy Jones, Kamal Ndousse, Amanda Askell, Anna Chen, Nova DasSarma, Dawn Drain, Stanislav Fort, Deep Ganguli, Tom Henighan, Nicholas Joseph, Saurav Kadavath, Jackson Kernion, Tom Conerly, Sheer El-Showk, Nelson Elhage, Zac Hatfield-Dodds, Danny Hernandez, Tristan Hume, Scott Johnston, Shauna Kravec, Liane Lovitt, Neel Nanda, Catherine Olsson, Dario Amodei, Tom Brown, Jack Clark, Sam McCandlish, Chris Olah, Ben Mann, and Jared Kaplan. Training a helpful and harmless assistant with reinforcement learning from human feedback. 2022. URL: https://arxiv.org/abs/2204.05862, arXiv:2204.05862.

        -
        +
        [BKK+22]

        Yuntao Bai, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson Kernion, Andy Jones, Anna Chen, Anna Goldie, Azalia Mirhoseini, Cameron McKinnon, Carol Chen, Catherine Olsson, Christopher Olah, Danny Hernandez, Dawn Drain, Deep Ganguli, Dustin Li, Eli Tran-Johnson, Ethan Perez, Jamie Kerr, Jared Mueller, Jeffrey Ladish, Joshua Landau, Kamal Ndousse, Kamile Lukosuite, Liane Lovitt, Michael Sellitto, Nelson Elhage, Nicholas Schiefer, Noemi Mercado, Nova DasSarma, Robert Lasenby, Robin Larson, Sam Ringer, Scott Johnston, Shauna Kravec, Sheer El Showk, Stanislav Fort, Tamera Lanham, Timothy Telleen-Lawton, Tom Conerly, Tom Henighan, Tristan Hume, Samuel R. Bowman, Zac Hatfield-Dodds, Ben Mann, Dario Amodei, Nicholas Joseph, Sam McCandlish, Tom Brown, and Jared Kaplan. Constitutional ai: harmlessness from ai feedback. 2022. URL: https://arxiv.org/abs/2212.08073, arXiv:2212.08073.

        -
        +
        [Blo23]

        NeurIPS Blog. Announcing the neurips 2023 paper awards. 2023. NeurIPS 2023 Awards. URL: https://blog.neurips.cc/2023/12/11/announcing-the-neurips-2023-paper-awards/.

        -
        +
        [CCL+24]

        Guiming Hardy Chen, Shunian Chen, Ziche Liu, Feng Jiang, and Benyou Wang. Humans or llms as the judge? a study on judgement biases. 2024. URL: https://arxiv.org/abs/2402.10669, arXiv:2402.10669.

        -
        +
        [DPHZ23]

        Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. Qlora: efficient finetuning of quantized llms. 2023. URL: https://arxiv.org/abs/2305.14314, arXiv:2305.14314.

        -
        +
        [DDZ+24] (1,2)

        Qingxiu Dong, Li Dong, Xingxing Zhang, Zhifang Sui, and Furu Wei. Self-boosting large language models with synthetic preference data. 2024. URL: https://arxiv.org/abs/2410.06961, arXiv:2410.06961.

        -
        -[Fac24] -

        Hugging Face. Zephyr. 2024. Zephyr. URL: https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha.

        -
        -
        -[Fac4c] -

        Hugging Face. Rlhf. 2024c. RLHF. URL: https://huggingface.co/blog/rlhf.

        -
        -
        -[Fac4d] -

        Hugging Face. Trl. 2024d. TRL. URL: https://huggingface.co/docs/trl/en/index.

        -
        -
        +
        [FQH+24]

        Duanyu Feng, Bowen Qin, Chen Huang, Zheng Zhang, and Wenqiang Lei. Towards analyzing and understanding the limitations of dpo: a theoretical perspective. 2024. URL: https://arxiv.org/abs/2404.04626, arXiv:2404.04626.

        -
        +
        [H44a] (1,2) -

        Hugging Face H4. Ultrafeedback binarized dataset. 2024a. A dataset of binary preference data for training language models. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized.

        +

        HuggingFace H4. Ultrafeedback binarized dataset. 2024a. A dataset of binary preference data for training language models. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized.

        -
        +
        [H44b] -

        Hugging Face H4. Hugging face h4. 2024b. Hugging Face H4. URL: https://huggingface.co/HuggingFaceH4.

        +

        HuggingFace H4. Huggingface h4. 2024b. HuggingFace H4. URL: https://huggingface.co/HuggingFaceH4.

        -
        +
        [HHJ+24]

        Shuang Hao, Wenfeng Han, Tao Jiang, Yiping Li, Haonan Wu, Chunlin Zhong, Zhangjun Zhou, and He Tang. Synthetic data in ai: challenges, applications, and ethical implications. 2024. URL: https://arxiv.org/abs/2401.01629, arXiv:2401.01629.

        -
        +
        [HLT24]

        Jiwoo Hong, Noah Lee, and James Thorne. Orpo: monolithic preference optimization without reference model. 2024. URL: https://arxiv.org/abs/2403.07691, arXiv:2403.07691.

        -
        +
        [HDN+24]

        Zhenyu Hou, Pengfan Du, Yilin Niu, Zhengxiao Du, Aohan Zeng, Xiao Liu, Minlie Huang, Hongning Wang, Jie Tang, and Yuxiao Dong. Does rlhf scale? exploring the impacts from data, model, and method. 2024. URL: https://arxiv.org/abs/2412.06000, arXiv:2412.06000.

        -
        +
        [HSW+21]

        Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. Lora: low-rank adaptation of large language models. 2021. URL: https://arxiv.org/abs/2106.09685, arXiv:2106.09685.

        -
        +
        [HGH+22]

        Jiaxin Huang, Shixiang Shane Gu, Le Hou, Yuexin Wu, Xuezhi Wang, Hongkun Yu, and Jiawei Han. Large language models can self-improve. 2022. URL: https://arxiv.org/abs/2210.11610, arXiv:2210.11610.

        -
        +
        +[Hug24] +

        HuggingFace. Zephyr. 2024. Zephyr. URL: https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha.

        +
        +
        +[Hug4c] +

        HuggingFace. Rlhf. 2024c. RLHF. URL: https://huggingface.co/blog/rlhf.

        +
        +
        +[Hug4d] +

        HuggingFace. Trl. 2024d. TRL. URL: https://huggingface.co/docs/trl/en/index.

        +
        +
        [Huy24]

        Chip Huyen. AI Engineering. O'Reilly Media, Inc., December 2024. ISBN 9781098129095. URL: https://www.oreilly.com/library/view/ai-engineering/9781098129095/.

        -
        +
        [KSD+24]

        Joshua Kazdan, Rylan Schaeffer, Apratim Dey, Matthias Gerstgrasser, Rafael Rafailov, David L. Donoho, and Sanmi Koyejo. Collapse or thrive? perils and promises of synthetic data in a self-generating world. 2024. URL: https://arxiv.org/abs/2410.16713, arXiv:2410.16713.

        -
        +
        [KSY+24]

        Seungone Kim, Juyoung Suk, Xiang Yue, Vijay Viswanathan, Seongyun Lee, Yizhong Wang, Kiril Gashteovski, Carolin Lawrence, Sean Welleck, and Graham Neubig. Evaluating language models as synthetic data generators. 2024. URL: https://arxiv.org/abs/2412.03679, arXiv:2412.03679.

        -
        +
        [LT24]

        AI @ Meta Llama Team. The llama 3 herd of models. 2024. URL: https://arxiv.org/abs/2407.21783, arXiv:2407.21783.

        -
        +
        [LWX+24]

        Lin Long, Rui Wang, Ruixuan Xiao, Junbo Zhao, Xiao Ding, Gang Chen, and Haobo Wang. On llms-driven synthetic data generation, curation, and evaluation: a survey. 2024. URL: https://arxiv.org/abs/2406.15126, arXiv:2406.15126.

        -
        +
        [Met24]

        Meta. Meta-llama. 2024. Meta-Llama. URL: https://huggingface.co/meta-llama.

        -
        +
        [OWJ+22] (1,2,3,4,5,6,7)

        Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.

        -
        +
        [Qwe24]

        Qwen. Qwen. 2024. Qwen. URL: https://huggingface.co/Qwen.

        -
        +
        [RSM+24] (1,2,3,4)

        Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.

        -
        +
        [SWD+17]

        John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. Proximal policy optimization algorithms. 2017. URL: https://arxiv.org/abs/1707.06347, arXiv:1707.06347.

        -
        +
        [SmolLM224] (1,2) -

        Hugging Face SmolLM2. Smollm: a small language model distilled from a larger language model for task-specific applications. 2024. Blog post describing techniques for distilling smaller, task-specific language models. URL: https://huggingface.co/blog/smollm.

        +

        HuggingFace SmolLM2. Smollm: a small language model distilled from a larger language model for task-specific applications. 2024. Blog post describing techniques for distilling smaller, task-specific language models. URL: https://huggingface.co/blog/smollm.

        -
        +
        [SmolLM2360MI24] -

        Hugging Face SmolLM2-360M-Instruct. Smollm2-360m-instruct. 2024. 360M parameter instruction-tuned language model, distilled for efficient deployment. URL: https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct.

        +

        HuggingFace SmolLM2-360M-Instruct. Smollm2-360m-instruct. 2024. 360M parameter instruction-tuned language model, distilled for efficient deployment. URL: https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct.

        -
        +
        [Sou24]

        Tharsis T. P. Souza. Tamingllms: a framework for evaluating and aligning language models. 2024. URL: https://www.souzatharsis.com/tamingLLMs/notebooks/evals.html.

        -
        +
        [SRvERH24]

        Márton Szép, Daniel Rueckert, Rüdiger von Eisenhart-Rothe, and Florian Hinterwimmer. A practical guide to fine-tuning language models with limited data. 2024. URL: https://arxiv.org/abs/2411.09539, arXiv:2411.09539.

        -
        +
        [TMS+23]

        Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: https://arxiv.org/abs/2307.09288, arXiv:2307.09288.

        -
        +
        [VAA+24]

        Bertie Vidgen, Adarsh Agrawal, Ahmed M. Ahmed, Victor Akinwande, Namir Al-Nuaimi, Najla Alfaraj, Elie Alhajjar, Lora Aroyo, Trupti Bavalatti, Max Bartolo, Borhane Blili-Hamelin, Kurt Bollacker, Rishi Bomassani, Marisa Ferrara Boston, Siméon Campos, Kal Chakra, Canyu Chen, Cody Coleman, Zacharie Delpierre Coudert, Leon Derczynski, Debojyoti Dutta, Ian Eisenberg, James Ezick, Heather Frase, Brian Fuller, Ram Gandikota, Agasthya Gangavarapu, Ananya Gangavarapu, James Gealy, Rajat Ghosh, James Goel, Usman Gohar, Sujata Goswami, Scott A. Hale, Wiebke Hutiri, Joseph Marvin Imperial, Surgan Jandial, Nick Judd, Felix Juefei-Xu, Foutse Khomh, Bhavya Kailkhura, Hannah Rose Kirk, Kevin Klyman, Chris Knotz, Michael Kuchnik, Shachi H. Kumar, Srijan Kumar, Chris Lengerich, Bo Li, Zeyi Liao, Eileen Peters Long, Victor Lu, Sarah Luger, Yifan Mai, Priyanka Mary Mammen, Kelvin Manyeki, Sean McGregor, Virendra Mehta, Shafee Mohammed, Emanuel Moss, Lama Nachman, Dinesh Jinenhally Naganna, Amin Nikanjam, Besmira Nushi, Luis Oala, Iftach Orr, Alicia Parrish, Cigdem Patlak, William Pietri, Forough Poursabzi-Sangdeh, Eleonora Presani, Fabrizio Puletti, Paul Röttger, Saurav Sahay, Tim Santos, Nino Scherrer, Alice Schoenauer Sebag, Patrick Schramowski, Abolfazl Shahbazi, Vin Sharma, Xudong Shen, Vamsi Sistla, Leonard Tang, Davide Testuggine, Vithursan Thangarasa, Elizabeth Anne Watkins, Rebecca Weiss, Chris Welty, Tyler Wilbers, Adina Williams, Carole-Jean Wu, Poonam Yadav, Xianjun Yang, Yi Zeng, Wenhui Zhang, Fedor Zhdanov, Jiacheng Zhu, Percy Liang, Peter Mattson, and Joaquin Vanschoren. Introducing v0.5 of the ai safety benchmark from mlcommons. 2024. URL: https://arxiv.org/abs/2404.12241, arXiv:2404.12241.

        -
        +
        [WYG+24]

        Tianhao Wu, Weizhe Yuan, Olga Golovneva, Jing Xu, Yuandong Tian, Jiantao Jiao, Jason Weston, and Sainbayar Sukhbaatar. Meta-rewarding language models: self-improving alignment with llm-as-a-meta-judge. 2024. URL: https://arxiv.org/abs/2407.19594, arXiv:2407.19594.

        -
        +
        [YWX+24]

        Yueqin Yin, Zhendong Wang, Yujia Xie, Weizhu Chen, and Mingyuan Zhou. Self-augmented preference optimization: off-policy paradigms for language model alignment. ArXiv, 2024. URL: https://api.semanticscholar.org/CorpusID:270199610.

        diff --git a/tamingllms/_build/html/notebooks/cost.html b/tamingllms/_build/html/notebooks/cost.html index 459ba23..2764706 100644 --- a/tamingllms/_build/html/notebooks/cost.html +++ b/tamingllms/_build/html/notebooks/cost.html @@ -247,7 +247,7 @@
        -

        9. The Falling Cost Paradox

        +

        9. The Falling Cost Paradox

        -

        9.1. Why Optimization Matters More Than Ever

        -

        According to recent analysis from a16z [Andreessen Horowitz, 2024], the cost of LLM inference is decreasing by approximately 10x every year - a rate that outpaces even Moore’s Law in the PC revolution or Edholm’s Law during the bandwidth explosion of the dot-com era.

        +

        9.1. Why Optimization Matters More Than Ever

        +

        According to recent analysis from a16z [Andreessen Horowitz, 2024], the cost of LLM inference is decreasing by approximately 10x every year - a rate that outpaces even Moore’s Law in the PC revolution or Edholm’s Law during the bandwidth explosion of the dot-com era.

        LLMflation
        -

        Fig. 9.1 LLMflation [Andreessen Horowitz, 2024]: The cost of LLM inference is decreasing by approximately 10x every year.

        +

        Fig. 9.1 LLMflation [Andreessen Horowitz, 2024]: The cost of LLM inference is decreasing by approximately 10x every year.

        A model achieving an MMLU score of 42 that cost \(60 per million tokens in late 2021 can now be run for just \)0.06 per million tokens. For higher-capability models scoring 83 on MMLU, prices have fallen by a factor of 62 since GPT-4’s introduction in March 2023.

        @@ -345,16 +345,16 @@

        9.2. Right-Sizing LLMs: A Strategic Approach

        +

        9.2. Right-Sizing LLMs: A Strategic Approach

        Before implementing cost optimization strategies for LLMs, organizations must develop a comprehensive understanding of their own requirements and constraints. This systematic approach prevents both over-engineering and under-provisioning, leading to more efficient and cost-effective implementations.

        In this section, we define key performance and cost related metrics that will guide our discussion. Then we propose a set of requirements practitioners should consider before we dive into cost optimization techniques.

        -

        9.2.1. Metrics

        +

        9.2.1. Metrics

        -

        9.2.2. Requirements

        +

        9.2.2. Requirements

        -

        9.2.2.1. Business Requirements

        +

        9.2.2.1. Business Requirements

        First, one needs to define the problem to be solved and to what extent it is worth to be solved. Use case requirements form the foundation of any LLM implementation project. A clear definition of the specific business problema and task to be accomplished must be established upfront, along with concrete performance metrics covering accuracy, latency and throughput. This should be accompanied by well-defined cost-per-transaction targets, clear ROI expectations, and a strategic allocation of budgets across different use cases to ensure resources are optimally distributed.

        Budget and ROI considerations are critical for ensuring the long-term viability of LLM implementations. Organizations must establish clear spending limits that align with their financial capabilities while defining realistic cost-per-transaction targets. ROI expectations need to be carefully established through detailed analysis, followed by a strategic allocation of budgets across various use cases based on their business impact and priority.

        Compliance and security requirements cannot be overlooked. This involves a thorough identification of all applicable regulatory requirements and the establishment of robust data handling standards. Organizations must specify comprehensive audit requirements to maintain transparency and accountability, while implementing appropriate security controls to protect sensitive data and system access.

        @@ -362,17 +362,17 @@

        Local LLMs in Practice provides a detailed discussion on relevant considerations when Choosing your Model.

        -

        9.2.2.2. Performance Requirements

        +

        9.2.2.2. Performance Requirements

        Accuracy and quality form the foundation of any LLM deployment’s performance requirements. At its core, this involves determining the minimum level of accuracy that the model must achieve to be considered successful. This serves as a critical baseline for evaluating model performance and making deployment decisions. Establishing clear evaluation metrics, whether through automated measures or human evaluation processes, provides concrete ways to assess if these thresholds are being met. Continuous monitoring of these accuracy metrics ensures the system maintains its performance over time as usage patterns and data distributions evolve. Chapter The Evals Gap provides a detailed discussion on how to evaluate the performance of LLM-based applications.

        Latency and throughput requirements are equally crucial for ensuring a positive user experience and system reliability. These specifications define how quickly the system must respond to requests and how many concurrent users it can handle. Response time requirements must be carefully balanced against the computational resources available, while peak load capabilities need to account for usage spikes and growth patterns. The decision between real-time processing for immediate responses versus batch processing for efficiency depends heavily on the use case and user expectations.

        -

        9.2.2.3. Operational Requirements

        +

        9.2.2.3. Operational Requirements

        Scale and capacity planning forms the foundation of operational requirements for LLM deployments. This involves a comprehensive analysis of expected system usage and growth patterns to ensure the infrastructure can handle both current and future demands. Organizations must carefully project their daily and monthly API call volumes while calculating the average number of tokens per request to accurately estimate resource needs. Understanding usage patterns, including seasonal variations, enables proper capacity planning. Additionally, developing 12-24 month growth projections helps ensure the infrastructure can scale appropriately as demand increases.

        Reliability and availability requirements are equally critical for maintaining consistent service quality. These specifications define the expected uptime percentage that the system must maintain, typically expressed as a percentage of total operational time. Organizations need to establish clear maintenance windows that minimize disruption to users while ensuring necessary system updates and optimizations can be performed. Comprehensive backup and failover requirements must be specified to ensure business continuity in case of failures. High availability needs should be clearly defined, including redundancy levels and recovery time objectives, to maintain service quality even during unexpected events.

        -

        9.2.2.4. Technical Requirements

        +

        9.2.2.4. Technical Requirements

        System integration requirements define how the LLM system will interact and communicate with existing infrastructure and applications. This involves carefully mapping all integration points where the LLM system needs to connect with other systems, establishing standardized data formats and interfaces for seamless communication, implementing robust security measures to protect data in transit, and identifying any technical constraints that could impact integration. Getting these integration requirements right is crucial for ensuring the LLM system can function effectively within the broader technical ecosystem.

        Data management requirements address how information will be stored, processed, and maintained within the LLM system. This encompasses determining appropriate storage solutions for maintaining conversation context and history, selecting and configuring vector databases to enable efficient retrieval-augmented generation (RAG), creating comprehensive data retention policies that balance operational needs with resource constraints, and ensuring all data handling practices comply with relevant privacy regulations. Proper data management is essential for both system performance and regulatory compliance, making it a critical consideration in any LLM implementation.

        This structured approach to requirements analysis enables organizations to:

        @@ -387,7 +387,7 @@

        -

        9.3. Quantization

        +

        9.3. Quantization

        Quantization is a common and relevant technique in making LLMs more efficient and accessible. At a high level, quantization reduces the number of bits used to represent a model’s parameters. The most common form of quantization is to represent model’s weights at lower precision at post-training phase. It has become a standard technique to generate a series of quantized models given a large pre-trained base model.

        While a standard pre-trained LLM might use 32-bit floating-point (FP32) or 16-bit floating-point (FP16) numbers to store its weights, quantized versions can operate at lower precision levels such as 8, 4 or even 2 bits per parameter, reducing memory footprint without proportional losses in performance, necessarily. For instance, for a model of 30 billion parameters, using FP32 means 4 bytes per weight or 120 GB for the whole weights. If the model is quantized such that weights are represented in 1 byte, the memory needed for the model’s weights decreases to 30 GB, hence potentially fitting into consumer grade hardware. This is done at the cost of precision loss, but the trade-off is often worth it though require careful analysis.

        Let’s take a look at model weights of a language model (SmolLM2-135M-Instruct) that has been quantized to 2-bit and 16-bit precisions. We will use an utility function load_gguf from the taming_utils package to load model weights of the quantized models directly from Hugging Face.

        @@ -483,21 +483,21 @@

        [Unsloth, 2024] [2]. The model’s memory requirements vary significantly based on the quantization level used as demonstrated in Fig. 9.2.

        +

        Quantization[2] is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by [Unsloth, 2024] [3]. The model’s memory requirements vary significantly based on the quantization level used as demonstrated in Fig. 9.2.

        Quantized Model Size

        Fig. 9.2 Quantized Model Size: unsloth/Llama-3.3-70B-Instruct-GGUF

        -

        We observe that the quantization process yields remarkable reductions in model size, demonstrating a clear trade-off between precision and memory requirements. The transition from F16 (141.1 GB) to Q8_0 (75 GB) achieves a dramatic 47% reduction in model size while maintaining relatively high numerical precision. Further quantization levels reveal an interesting pattern of diminishing returns - each step down in precision yields progressively smaller absolute size reductions, though the cumulative effect remains significant. At the extreme end, the Q2_K model (26.4 GB) requires only 19% of the storage space of its F16 counterpart [3].

        +

        We observe that the quantization process yields remarkable reductions in model size, demonstrating a clear trade-off between precision and memory requirements. The transition from F16 (141.1 GB) to Q8_0 (75 GB) achieves a dramatic 47% reduction in model size while maintaining relatively high numerical precision. Further quantization levels reveal an interesting pattern of diminishing returns - each step down in precision yields progressively smaller absolute size reductions, though the cumulative effect remains significant. At the extreme end, the Q2_K model (26.4 GB) requires only 19% of the storage space of its F16 counterpart [4].

        This wide spectrum of model sizes enables deployment across diverse hardware environments. The lightweight Q2_K variant opens possibilities for running inference on consumer-grade hardware like high-end laptops or desktop computers. In contrast, the full-precision F16 model demands enterprise-grade computing resources with substantial memory capacity. This flexibility in deployment options makes quantization a powerful tool for democratizing access to large language models while managing computational costs.

        -

        While quantization has proven highly effective, there is a limit to how far it can be pushed - specifically, the 1-bit ceiling. A notable advancement in this space is BitNet [Wang et al., 2024] which pushes the boundaries of extreme quantization.

        +

        While quantization has proven highly effective, there is a limit to how far it can be pushed - specifically, the 1-bit ceiling. A notable advancement in this space is BitNet [Wang et al., 2024] which pushes the boundaries of extreme quantization.

        BitNet’s implementation, bitnet.cpp, has demonstrated significant performance improvements across both ARM and x86 architectures (see Fig. 9.3). When compared to llama.cpp, the framework achieves speedups ranging from 1.37x to 5.07x on ARM processors and 2.37x to 6.17x on x86 systems. These performance gains scale with model size - larger models benefit more substantially from BitNet’s optimizations. The efficiency improvements extend beyond raw speed: energy consumption drops by 55-70% on ARM and 71-82% on x86 processors. Perhaps most impressively, bitnet.cpp enables running a 100B parameter BitNet b1.58 model on a single CPU at speeds matching human reading pace (5-7 tokens per second).

        BitNet
        -

        Fig. 9.3 BitNet: [Wang et al., 2024]

        +

        Fig. 9.3 BitNet: [Wang et al., 2024]

        The framework’s initial release focused on CPU inference optimization, with particular emphasis on 1-bit LLM architectures (BitNet b1.58). While initial testing shows promising results, these findings are specific to the tested models and kernels (its specialized kernels are carefully crafted to exploit the unique characteristics of these extremely quantized models). Further validation is needed before generalizing these results across different architectures and use cases.

        @@ -506,7 +506,7 @@

        Local LLMs in Practice for more details.

        -

        9.4. Check-list

        +

        9.4. Check-list

        Planning and Requirements

        • Start with a clear understanding of your application’s needs and the factors that contribute to LLM costs

        • @@ -540,7 +540,7 @@

          -

          9.5. Conclusion

          +

          9.5. Conclusion

          CC BY-NC-SA 4.0

          @misc{tharsistpsouza2024tamingllms,
             author = {Tharsis T. P. Souza},
          @@ -554,25 +554,25 @@ 

          -

          9.6. References

          -
          -
          +

          9.6. References

          +
          +
          [WZS+24] -(1,2) +(1,2)

          Jinheng Wang, Hansong Zhou, Ting Song, Shaoguang Mao, Shuming Ma, Hongyu Wang, Yan Xia, and Furu Wei. 1-bit ai infra: part 1.1, fast and lossless bitnet b1.58 inference on cpus. 2024. URL: https://arxiv.org/abs/2410.16144, arXiv:2410.16144.

          -
          +
          [AndreessenHorowitz24] (1,2)

          Andreessen Horowitz. Llmflation: understanding and mitigating llm inference cost. Blog Post, 2024. Analysis of LLM inference costs and strategies for optimization. URL: https://a16z.com/llmflation-llm-inference-cost/.

          -
          -[HuggingFace4w] -

          Hugging Face. Gguf quantization types. Online Documentation, 2024w. Documentation on different quantization types available for GGUF models. URL: https://huggingface.co/docs/hub/gguf#quantization-types.

          +
          +[HuggingFace4w] +

          HuggingFace. Gguf quantization types. Online Documentation, 2024w. Documentation on different quantization types available for GGUF models. URL: https://huggingface.co/docs/hub/gguf#quantization-types.

          -
          -[Unsloth24] -

          Unsloth. Llama-3.3-70b-instruct-gguf. Hugging Face Model, 2024. GGUF quantized version of Meta's Llama 3.3 70B instruction-tuned model. URL: https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-GGUF.

          +
          +[Unsloth24] +

          Unsloth. Llama-3.3-70b-instruct-gguf. HuggingFace Model, 2024. GGUF quantized version of Meta's Llama 3.3 70B instruction-tuned model. URL: https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-GGUF.

          @@ -582,13 +582,17 @@

          [1]

          Quote from Jonathan Ross, CEO of Groq, a company that specializes in AI Inference services.

          +

        diff --git a/tamingllms/_build/html/notebooks/evals.html b/tamingllms/_build/html/notebooks/evals.html index f039449..e4294fb 100644 --- a/tamingllms/_build/html/notebooks/evals.html +++ b/tamingllms/_build/html/notebooks/evals.html @@ -260,7 +260,7 @@
        -

        3. The Evals Gap

        +

        3. The Evals Gap

        It doesn’t matter how beautiful your theory is,
        it doesn’t matter how smart you are.
        @@ -270,48 +270,48 @@

        -

        3.1. Introduction

        +

        3.1. Introduction

        The advent of LLMs marks a pivotal shift in the landscape of software development, testing and verification. Unlike traditional software systems, where deterministic outputs are the norm, LLMs introduce a realm of non-deterministic and generative behaviors that challenge conventional software engineering paradigms. This shift is not merely a technical evolution but a fundamental transformation in how we conceive, build, and assess software products.

        For those entrenched in traditional methodologies, the transition to LLM-driven systems may seem daunting. However, ignoring this change is not an option. The reliance on outdated testing frameworks that fail to account for the probabilistic nature of LLMs will inevitably lead to significant setbacks.

        To overcome these challenges, it is imperative to embrace the complexities of LLMs with a proactive mindset. This involves developing robust evaluation frameworks up-front that incorporate the generative nature of LLM-based software development while fostering a culture of continuous change, learning and adaptation.

        -

        3.2. Non-Deterministic Generative Machines

        +

        3.2. Non-Deterministic Generative Machines

        One of the most fundamental challenges when building products with LLMs is their generative and non-deterministic nature. Unlike traditional software systems where the same input reliably produces the same output, LLMs can generate novel text that may not exist in their training data, and produce different responses each time they’re queried - even with identical prompts and input data. This behavior is both a strength and a significant engineering and product challenge.

        When you ask an LLM the same question multiple times, you’ll likely get different responses. This isn’t a bug - it’s a fundamental feature of how these models work. The “temperature” parameter, which controls the randomness of outputs, allows models to be creative and generate diverse responses. However, this same feature makes it difficult to build reliable, testable systems.

        Consider a financial services company using LLMs to generate investment advice. The non-deterministic nature of these models means that:

        @@ -325,7 +325,7 @@

      2. Calculates probability distributions for each next token

      3. Samples from these distributions based on temperature settings

      4. -
      5. Uses techniques like nucleus sampling [Holtzman et al., 2020] or top-k sampling to balance creativity and coherence

      6. +
      7. Uses techniques like nucleus sampling [Holtzman et al., 2020] or top-k sampling to balance creativity and coherence

      In this simple experiment, we use an LLM to write a single-statement executive summary from an input financial filing. We observe that even a simple parameter like temperature can dramatically alter model behavior in ways that are difficult to systematically assess. At temperature 0.0, responses are consistent but potentially too rigid. At 1.0, outputs become more varied but less predictable. At 2.0, responses can be wildly different and often incoherent. This non-deterministic behavior makes traditional software testing approaches inadequate.

      @@ -437,7 +437,7 @@

      [Raschka, 2024]:

      +

      A temperature of 1 represents the unscaled probability scores for each token in the vocabulary. Decreasing the temperature closer to 0 sharpens the distribution, so the most likely token will have an even higher probability score. Conversely, increasing the temperature makes the distribution more uniform [Raschka, 2024]:

      • Temperature = 0: Most deterministic, but potentially repetitive

      • Temperature = 1: Balanced creativity and coherence

      • @@ -446,19 +446,19 @@

        -

        3.3. Emerging Properties

        +

        3.3. Emerging Properties

        Beyond their non-deterministic nature, LLMs present another fascinating characteristic: emergent abilities that spontaneously arise as models scale up in size. These abilities - from basic question answering to complex reasoning - aren’t explicitly programmed but rather emerge “naturally” as the models grow larger and are trained on more data. This makes evaluation fundamentally different from traditional software testing, where capabilities are explicitly coded and can be tested against pre-defined specifications.

        -

        Fig. 3.1 provides a list of emergent abilities of large language models and the scale [Wei et al., 2022]. The relationship between model scale and emergent abilities follows a fascinating non-linear pattern. Below certain size thresholds, specific abilities may be completely absent from the model - it simply cannot perform certain tasks, no matter how much you try to coax them out. However, once the model reaches critical points in its scaling journey, these abilities can suddenly manifest in what researchers call a phase transition - a dramatic shift from inability to capability. This unpredictable emergence of capabilities stands in stark contrast to traditional software development, where features are deliberately implemented and can be systematically tested.

        +

        Fig. 3.1 provides a list of emergent abilities of large language models and the scale [Wei et al., 2022]. The relationship between model scale and emergent abilities follows a fascinating non-linear pattern. Below certain size thresholds, specific abilities may be completely absent from the model - it simply cannot perform certain tasks, no matter how much you try to coax them out. However, once the model reaches critical points in its scaling journey, these abilities can suddenly manifest in what researchers call a phase transition - a dramatic shift from inability to capability. This unpredictable emergence of capabilities stands in stark contrast to traditional software development, where features are deliberately implemented and can be systematically tested.

        Emerging Properties
        -

        Fig. 3.1 Emergent abilities of large language models and the scale [Wei et al., 2022].

        +

        Fig. 3.1 Emergent abilities of large language models and the scale [Wei et al., 2022].

        The implications for evaluation are critical. While conventional software testing relies on stable test suites and well-defined acceptance criteria, LLM evaluation must contend with a constantly shifting landscape of capabilities. What worked to evaluate a 7B parameter model may be completely inadequate for a 70B parameter model that has developed new emergent abilities. This dynamic nature of LLM capabilities forces us to fundamentally rethink our approach to testing and evaluation.

        -

        3.4. Problem Statement

        +

        3.4. Problem Statement

        Consider a practical example that illustrates these challenges: building a Math AI tutoring system for children powered by an LLM. In traditional software development, you would define specific features (like presenting math problems or checking answers) and write tests to verify each function. But with LLMs, you’re not just testing predefined features - you’re trying to evaluate emergent capabilities like adapting explanations to a child’s level, maintaining engagement through conversational learning, and providing age-appropriate safety-bound content.

        This fundamental difference raises critical questions about evaluation:

          @@ -508,7 +508,7 @@

          -

          3.5. Evals Design

          +

          3.5. Evals Design

          First, it’s important to make a distinction between evaluating an LLM versus evaluating an LLM-based application. While the former offers foundation capabilities and are typically general-purpose, the latter is more specific and tailored to a particular use case. Here, we define an LLM-based application as a system that uses one or more LLMs to perform a specific task. More specifically, an LLM-based application is the combination of one or more LLM models, their associated prompts and parameters to solve a particular business problem.

          That differentiation is important because it changes the scope of evaluation. LLMs are usually evaluated based on their capabilities, which include things like language understanding, reasoning and knowledge. LLM-based applications, instead, should be evaluated based on their end-to-end functionality, performance, and how well they meet business requirements. That distinction has key implications for the design of evaluation systems:

        -

        3.7. Evaluators

        +

        3.7. Evaluators

        -

        3.7.1. Model-Based Evaluation

        +

        3.7.1. Model-Based Evaluation

        Traditional metrics like BLEU or ROUGE often fall short in capturing the nuanced, contextual, and creative outputs of LLMs. As an alternative we can consider a “Model-based evaluation” approach. A common approach is to use an LLM as a judge. This is an approach that leverages language models themselves to assess the quality of outputs from other language models. This method involves using a model (often a more capable one) to act as an automated judge, evaluating aspects like accuracy, coherence, and relevance of generated content. Unlike traditional metrics that rely on exact matching or statistical measures, model-based evaluation can capture nuanced aspects of language and provide more contextual assessment.

        -

        As discussed in the paper [Li et al., 2024], LLM-based evaluation approaches generally fall into two main categories:

        +

        As discussed in the paper [Li et al., 2024], LLM-based evaluation approaches generally fall into two main categories:

        1. Prompt-based evaluation: This involves using prompts to instruct existing LLMs to evaluate text quality without any fine-tuning. The evaluation can take several forms:

            @@ -1098,13 +1098,13 @@

            -Conceptual Overview +Conceptual Overview

            Fig. 3.4 Conceptual overview of LLM-as-a-Judge evaluation.

    Compared to traditional metrics, LLM-as-a-Judge evaluation offers a more sophisticated assessment framework by leveraging natural language criteria. While metrics focus on statistical measures, judge models excel at evaluating subjective qualities such as creativity, narrative flow, and contextual relevance - aspects that closely mirror human judgment. The judge model processes evaluation guidelines expressed in natural language, functioning similarly to a human reviewer interpreting assessment criteria. One notable consideration is that this approach requires careful prompt engineering to properly define and communicate the evaluation standards to the model.

    -

    Prompt Engineering can have a large impact on the quality of the evaluation [Li et al., 2024]. Hence, it’s worth noting key prompting best practices when designing LLM-as-a-judge evaluators [Face, 2024]:

    +

    Prompt Engineering can have a large impact on the quality of the evaluation [Li et al., 2024]. Hence, it’s worth noting key prompting best practices when designing LLM-as-a-judge evaluators [HuggingFace, 2024]:

    1. Use discrete integer scales (e.g., 1-5) rather than continuous ranges

    2. Provide clear rubrics that define what each score level means

    3. @@ -1310,12 +1310,12 @@

      gpt-3.5-turbo model had the lowest scores overall (expertise: 4, coherence: 5, fluency: 7, similarity: 2), particularly struggling with expertise and similarity to the benchmark. While it maintained reasonable fluency, the significant drop in similarity score suggests substantial deviation from the reference summary.

      The visualization helps highlight these differences across models and evaluation dimensions. A clear performance gradient is visible from gpt-4o-mini to gpt-3.5-turbo, with the latter showing marked degradation in most metrics.

      -

      Leveraging LLMs for evaluation has several limitations [Li et al., 2024]. Firstly, computational overhead should not be neglected given the inherent cost of running additional model inferences iterations. LLM evaluators can also exhibit various biases, including order bias (preferring certain sequence positions), egocentric bias (favoring outputs from similar models), and length bias. Further, there may be a tight dependency on prompt quality - small prompt variations may lead to substantially different outcomes. It is important to also note challenges around domain-specific evaluation in fields such as medicine, finance, law etc, where a general llm-as-a-judge approach may not be suitable.

      +

      Leveraging LLMs for evaluation has several limitations [Li et al., 2024]. Firstly, computational overhead should not be neglected given the inherent cost of running additional model inferences iterations. LLM evaluators can also exhibit various biases, including order bias (preferring certain sequence positions), egocentric bias (favoring outputs from similar models), and length bias. Further, there may be a tight dependency on prompt quality - small prompt variations may lead to substantially different outcomes. It is important to also note challenges around domain-specific evaluation in fields such as medicine, finance, law etc, where a general llm-as-a-judge approach may not be suitable.

      The LLM-as-a-Judge strategy can serve as a scalable and nuanced solution to evaluate LLM-based applications. While it does not entirely replace metrics-based or human-based approaches, it significantly augments evaluation workflows, especially in scenarios requiring evaluation of generative outputs. Future improvements in our example include integrating human oversight and refining LLMs for domain-specific evaluation tasks.

      -

      One open source solution trying to overcome some of these challenges is Glider [Deshpande et al., 2024], a 3B evaluator LLM that can score any text input and associated context on arbitrary user defined criteria. Glider is an LLM model trained on 685 domains and 183 criteria whose judgement scores show 91.3% agreement with human judgments, making it suitable for a diverse range of real world applications.

      +

      One open source solution trying to overcome some of these challenges is Glider [Deshpande et al., 2024], a 3B evaluator LLM that can score any text input and associated context on arbitrary user defined criteria. Glider is an LLM model trained on 685 domains and 183 criteria whose judgement scores show 91.3% agreement with human judgments, making it suitable for a diverse range of real world applications.

      -

      3.7.2. Evaluating Evaluators

      +

      3.7.2. Evaluating Evaluators

      We have discussed how LLMs can be used to evaluate LLM-based aplications. However, how can we evaluate the performance of LLMs that evaluate other LLMs? This is the question that meta evaluation aims to answer. Clearly, the discussion can become quite meta as we need to evaluate the performance of the evaluator to evaluate the performance of the evaluated model. However, one can make a case for two general options:

      1. Use a golden-standard dataset that is used to evaluate the performance of LLM evaluators using a “metrics-based” approach.

      2. @@ -1332,9 +1332,9 @@

        Fig. 3.5 Conceptual overview of LLMs Meta Evaluation.

        -

        An alternative to the above approaches is to use humans to directly evaluate the LLM-judges themselves. A notable example of this is Judge Arena [Arena, 2024], which is a platform that allows users to vote on which AI model made the better evaluation. Under this approach, the performance of the LLM evaluator is given by the (blind) evaluation of humans who perform the voting on randomly generated pairs of LLM judges as depicted in Fig. 3.6. Only after submitting a vote, users can see which models were actually doing the judging.

        +

        An alternative to the above approaches is to use humans to directly evaluate the LLM-judges themselves. A notable example of this is Judge Arena [Arena, 2024], which is a platform that allows users to vote on which AI model made the better evaluation. Under this approach, the performance of the LLM evaluator is given by the (blind) evaluation of humans who perform the voting on randomly generated pairs of LLM judges as depicted in Fig. 3.6. Only after submitting a vote, users can see which models were actually doing the judging.

        -Human-in-the-loop meta evaluation Conceptual Overview +Human-in-the-loop meta evaluation Conceptual Overview

        Fig. 3.6 Human-in-the-loop Meta Evaluation.

        @@ -1359,20 +1359,20 @@

        -

        3.8. Benchmarks and Leaderboards

        +

        3.8. Benchmarks and Leaderboards

        Benchmarks act as standardized tests for LLMs, evaluating their performance across a spectrum of tasks. These tasks simulate real-world applications such as answering questions, generating coherent text, solving mathematical problems, or even writing computer code. They also assess more abstract qualities like fairness, robustness, and cultural understanding.

        Benchmarks can be thought as comprehensive “exams” that probe different “subjects” in order to certify an LLM. They help researchers and developers compare models systematically, in a way LLM performance is comparable while enabling the identification of emergent behaviors or capabilities as models evolve in scale and sophistication.

        -

        The history of LLM benchmarks reflects the evolving priorities of artificial intelligence research, starting with foundational tasks and moving toward complex, real-world challenges. We can start in 2018 with the introduction of GLUE (General Language Understanding Evaluation) [Wang et al., 2019], which set a new standard for evaluating natural language understanding. GLUE measured performance on tasks like sentiment analysis and textual entailment, providing a baseline for assessing the fundamental capabilities of language models. Later, SuperGLUE [Wang et al., 2019] expanded on this foundation by introducing more nuanced tasks that tested reasoning and language comprehension at a deeper level, challenging the limits of models like BERT and its successors.

        -

        As AI capabilities grew, benchmarks evolved to capture broader and more diverse aspects of intelligence. BIG-Bench [Srivastava et al., 2023] marked a turning point by incorporating over 200 tasks, spanning arithmetic, logic, and creative problem-solving. This collaborative effort aimed to probe emergent abilities in large models, offering insights into how scale and complexity influence performance. Around the same time, specialized benchmarks like TruthfulQA [Lin et al., 2022] emerged, addressing the critical need for models to provide accurate and non-deceptive information in a world increasingly dependent on AI for factual content.

        -

        MMLU (Massive Multitask Language Understanding) [Hendrycks et al., 2021] launched in 2021, provided a rigorous test of a model’s multidisciplinary knowledge, covering 57 subjects from STEM fields to humanities and social sciences. Similarly, in 2022, Stanford’s HELM (Holistic Evaluation of Language Models) [Liang et al., 2023] set a new standard for multidimensional assessment. HELM expanded the scope of evaluation beyond accuracy, incorporating factors like fairness, robustness, and computational efficiency. This benchmark was designed to address societal concerns surrounding AI, emphasizing safety and inclusion alongside technical performance.

        -

        Specialized benchmarks like HumanEval (2021) [Chen et al., 2021] focused on domain-specific tasks, such as code generation, testing models’ ability to translate natural language descriptions into functional programming code. In contrast, LMSYS (2023) brought real-world applicability into focus by evaluating conversational AI through multi-turn dialogues. LMSYS prioritized coherence, contextual understanding, and user satisfaction, providing a practical lens for assessing models like GPT and Claude in dynamic settings.

        -

        The HuggingFace Open LLM [Face, 2024] Leaderboard stands out for its transparency and accessibility in the open-source community. This leaderboard evaluates a wide range of LLMs across diverse tasks, including general knowledge, reasoning, and code-writing. Its commitment to reproducibility ensures that results are verifiable, enabling researchers and practitioners to replicate findings. By focusing on open-source models, it democratizes AI research and fosters innovation across communities, making it a valuable resource for both academics and industry professionals.

        -

        The Chatbot Arena (2024) Leaderboard (an evolution of LMSYS) [Chiang et al., 2024] takes an alternative approach by measuring real-world performance through direct model comparisons. Its evaluation format compares models in live conversations, with human judges providing qualitative assessments. This methodology has gathered hundreds of thousands of human evaluations, offering specific insights into practical model performance. The emphasis on interactive capabilities makes it relevant for developing user-facing applications like virtual assistants and chatbots.

        -

        The AlpacaEval [Dubois et al., 2024] and MT-Bench [Zheng et al., 2023] Leaderboards implement automated evaluation using LLMs to assess model performance in multi-turn conversations. This approach enables consistent assessment of dialogue capabilities while reducing human bias. Their methodology measures key aspects of conversational AI, including contextual understanding and response consistency across multiple exchanges.

        -

        An important recent development was the release of Global-MMLU [Singh et al., 2024], an improved version of MMLU with evaluation coverage across 42 languages. This open dataset, built through collaboration between Argilla, the Hugging Face community, and researchers from leading institutions like Cohere For AI, Mila, MIT, and others, represents a significant step toward more inclusive multilingual LLM evaluation. Hundreds of contributors used Argilla to annotate MMLU questions, revealing that 85% of questions requiring specific cultural knowledge were Western-centric. The newly released dataset is divided into two key subsets: Culturally Agnostic questions that require no specific regional or cultural knowledge, and Culturally Sensitive questions that depend on dialect, cultural, or geographic knowledge. With high-quality translations available for 25 languages, Global-MMLU enables better understanding of LLM capabilities and limitations across different languages and cultural contexts.

        -

        A major challenge with these leaderboards and benchmarks is test set contamination - when test data ends up in newer models’ training sets, rendering the benchmarks ineffective. While some benchmarks try to address this through crowdsourced prompts and evaluations from humans or LLMs, these approaches introduce their own biases and struggle with difficult questions. LiveBench [White et al., 2024] represents a novel solution, designed specifically to be resilient to both contamination and evaluation biases. As the first benchmark with continuously updated questions from recent sources, automated objective scoring, and diverse challenging tasks across multiple domains, LiveBench maintains its effectiveness even as models improve. Drawing from recent math competitions, research papers, news, and datasets, it creates contamination-free versions of established benchmark tasks. Current results show even top models achieving considerably lower performance compared to other benchmarks, demonstrating LiveBench’s ability to meaningfully differentiate model capabilities with relatively lower saturation. With monthly updates and an open collaborative approach, LiveBench aims to provide sustained value for model evaluation as the field advances.

        -

        Another notable benchmark is ZebraLogic [Lin et al., 2024], which evaluates logical reasoning capabilities of LLMs through Logic Grid Puzzles - a type of Constraint Satisfaction Problem [Brailsford et al., 1999] commonly found in tests like the LSAT. These puzzles require assigning unique values to N houses across M different features based on given clues, demanding strategic reasoning and deduction to arrive at a unique correct solution. The benchmark’s programmatically generated puzzles range from 2x2 to 6x6 in size and test LLMs using one-shot examples with reasoning steps. While humans can solve these puzzles through strategic methods like reductio ad absurdum and elimination, LLMs demonstrate significant limitations in this type of logical reasoning. Even the best-performing model, Claude 3.5 Sonnet, only achieves 33.4% accuracy across all puzzles and 12.4% on hard puzzles, with smaller models (7-10B parameters) solving less than 1% of hard puzzles as of December 2024. These results reveal critical gaps in LLMs’ capabilities around counterfactual thinking, reflective reasoning, structured memorization, and compositional generalization.

        -

        A significant milestone in AI evaluation came with the launch of the The Alignment Research Center (ARC) Prize [Chollet, 2024] by ARC Prize Inc., a non-profit for the public advancement of open artificial general intelligence. Hosted by Mike Knoop (Co-founder, Zapier) and François Chollet (Creator of Keras), this prize represents a paradigm shift in how we evaluate language models. Rather than focusing on narrow performance metrics, the ARC Prize assesses what it calls “cognitive sufficiency” - a model’s ability to generate meaningful insights and tackle open-ended challenges. This new way to think about LLM evaluation emphasizes creative thinking, sophisticated reasoning, and the capacity to make genuinely useful contributions to human knowledge. Arguably, it is an attempt to define and measure a step towards what it means to achieve AGI (Artificial General Intelligence).

        +

        The history of LLM benchmarks reflects the evolving priorities of artificial intelligence research, starting with foundational tasks and moving toward complex, real-world challenges. We can start in 2018 with the introduction of GLUE (General Language Understanding Evaluation) [Wang et al., 2019], which set a new standard for evaluating natural language understanding. GLUE measured performance on tasks like sentiment analysis and textual entailment, providing a baseline for assessing the fundamental capabilities of language models. Later, SuperGLUE [Wang et al., 2019] expanded on this foundation by introducing more nuanced tasks that tested reasoning and language comprehension at a deeper level, challenging the limits of models like BERT and its successors.

        +

        As AI capabilities grew, benchmarks evolved to capture broader and more diverse aspects of intelligence. BIG-Bench [Srivastava et al., 2023] marked a turning point by incorporating over 200 tasks, spanning arithmetic, logic, and creative problem-solving. This collaborative effort aimed to probe emergent abilities in large models, offering insights into how scale and complexity influence performance. Around the same time, specialized benchmarks like TruthfulQA [Lin et al., 2022] emerged, addressing the critical need for models to provide accurate and non-deceptive information in a world increasingly dependent on AI for factual content.

        +

        MMLU (Massive Multitask Language Understanding) [Hendrycks et al., 2021] launched in 2021, provided a rigorous test of a model’s multidisciplinary knowledge, covering 57 subjects from STEM fields to humanities and social sciences. Similarly, in 2022, Stanford’s HELM (Holistic Evaluation of Language Models) [Liang et al., 2023] set a new standard for multidimensional assessment. HELM expanded the scope of evaluation beyond accuracy, incorporating factors like fairness, robustness, and computational efficiency. This benchmark was designed to address societal concerns surrounding AI, emphasizing safety and inclusion alongside technical performance.

        +

        Specialized benchmarks like HumanEval (2021) [Chen et al., 2021] focused on domain-specific tasks, such as code generation, testing models’ ability to translate natural language descriptions into functional programming code. In contrast, LMSYS (2023) brought real-world applicability into focus by evaluating conversational AI through multi-turn dialogues. LMSYS prioritized coherence, contextual understanding, and user satisfaction, providing a practical lens for assessing models like GPT and Claude in dynamic settings.

        +

        The HuggingFace Open LLM [HuggingFace, 2024] Leaderboard stands out for its transparency and accessibility in the open-source community. This leaderboard evaluates a wide range of LLMs across diverse tasks, including general knowledge, reasoning, and code-writing. Its commitment to reproducibility ensures that results are verifiable, enabling researchers and practitioners to replicate findings. By focusing on open-source models, it democratizes AI research and fosters innovation across communities, making it a valuable resource for both academics and industry professionals.

        +

        The Chatbot Arena (2024) Leaderboard (an evolution of LMSYS) [Chiang et al., 2024] takes an alternative approach by measuring real-world performance through direct model comparisons. Its evaluation format compares models in live conversations, with human judges providing qualitative assessments. This methodology has gathered hundreds of thousands of human evaluations, offering specific insights into practical model performance. The emphasis on interactive capabilities makes it relevant for developing user-facing applications like virtual assistants and chatbots.

        +

        The AlpacaEval [Dubois et al., 2024] and MT-Bench [Zheng et al., 2023] Leaderboards implement automated evaluation using LLMs to assess model performance in multi-turn conversations. This approach enables consistent assessment of dialogue capabilities while reducing human bias. Their methodology measures key aspects of conversational AI, including contextual understanding and response consistency across multiple exchanges.

        +

        An important recent development was the release of Global-MMLU [Singh et al., 2024], an improved version of MMLU with evaluation coverage across 42 languages. This open dataset, built through collaboration between Argilla, the Hugging Face community, and researchers from leading institutions like Cohere For AI, Mila, MIT, and others, represents a significant step toward more inclusive multilingual LLM evaluation. Hundreds of contributors used Argilla to annotate MMLU questions, revealing that 85% of questions requiring specific cultural knowledge were Western-centric. The newly released dataset is divided into two key subsets: Culturally Agnostic questions that require no specific regional or cultural knowledge, and Culturally Sensitive questions that depend on dialect, cultural, or geographic knowledge. With high-quality translations available for 25 languages, Global-MMLU enables better understanding of LLM capabilities and limitations across different languages and cultural contexts.

        +

        A major challenge with these leaderboards and benchmarks is test set contamination - when test data ends up in newer models’ training sets, rendering the benchmarks ineffective. While some benchmarks try to address this through crowdsourced prompts and evaluations from humans or LLMs, these approaches introduce their own biases and struggle with difficult questions. LiveBench [White et al., 2024] represents a novel solution, designed specifically to be resilient to both contamination and evaluation biases. As the first benchmark with continuously updated questions from recent sources, automated objective scoring, and diverse challenging tasks across multiple domains, LiveBench maintains its effectiveness even as models improve. Drawing from recent math competitions, research papers, news, and datasets, it creates contamination-free versions of established benchmark tasks. Current results show even top models achieving considerably lower performance compared to other benchmarks, demonstrating LiveBench’s ability to meaningfully differentiate model capabilities with relatively lower saturation. With monthly updates and an open collaborative approach, LiveBench aims to provide sustained value for model evaluation as the field advances.

        +

        Another notable benchmark is ZebraLogic [Lin et al., 2024], which evaluates logical reasoning capabilities of LLMs through Logic Grid Puzzles - a type of Constraint Satisfaction Problem [Brailsford et al., 1999] commonly found in tests like the LSAT. These puzzles require assigning unique values to N houses across M different features based on given clues, demanding strategic reasoning and deduction to arrive at a unique correct solution. The benchmark’s programmatically generated puzzles range from 2x2 to 6x6 in size and test LLMs using one-shot examples with reasoning steps. While humans can solve these puzzles through strategic methods like reductio ad absurdum and elimination, LLMs demonstrate significant limitations in this type of logical reasoning. Even the best-performing model, Claude 3.5 Sonnet, only achieves 33.4% accuracy across all puzzles and 12.4% on hard puzzles, with smaller models (7-10B parameters) solving less than 1% of hard puzzles as of December 2024. These results reveal critical gaps in LLMs’ capabilities around counterfactual thinking, reflective reasoning, structured memorization, and compositional generalization.

        +

        A significant milestone in AI evaluation came with the launch of the The Alignment Research Center (ARC) Prize [Chollet, 2024] by ARC Prize Inc., a non-profit for the public advancement of open artificial general intelligence. Hosted by Mike Knoop (Co-founder, Zapier) and François Chollet (Creator of Keras), this prize represents a paradigm shift in how we evaluate language models. Rather than focusing on narrow performance metrics, the ARC Prize assesses what it calls “cognitive sufficiency” - a model’s ability to generate meaningful insights and tackle open-ended challenges. This new way to think about LLM evaluation emphasizes creative thinking, sophisticated reasoning, and the capacity to make genuinely useful contributions to human knowledge. Arguably, it is an attempt to define and measure a step towards what it means to achieve AGI (Artificial General Intelligence).

        Defining AGI according to ARC Prize:

        Consensus but wrong:

        @@ -1401,21 +1401,21 @@

        [Chollet, 12/08/2024]. A key takeaway is that algorithmic improvements, rather than massive computational resources, may be key to exceeding the target score for the ARC-AGI benchmark.

        +

        The ARC-AGI benchmark remained unbeaten for five years as of December 2024 (a minimum score of 85% in the private dataset is required to win) [Chollet, 12/08/2024]. A key takeaway is that algorithmic improvements, rather than massive computational resources, may be key to exceeding the target score for the ARC-AGI benchmark.

        In addition to the benchmarks discussed above, a growing set of domain-specific benchmarks is emerging to help evaluate LLMs in specific verticals, including:

          -
        • FinBench [Zhang et al., 2024]: Evaluates LLMs in the financial domain, covering tasks such as terminology understanding, temporal reasoning, future forecasting, scenario planning, and numerical modelling.

        • -
        • LegalBench [Guha et al., 2023] : Assesses the legal reasoning abilities of LLMs through tasks crowdsourced by legal professionals

        • -
        • Berkeley Function Leaderboard (BFCL) [Patil et al., 2023]: Evaluates LLMs’ function-calling abilities

        • +
        • FinBench [Zhang et al., 2024]: Evaluates LLMs in the financial domain, covering tasks such as terminology understanding, temporal reasoning, future forecasting, scenario planning, and numerical modelling.

        • +
        • LegalBench [Guha et al., 2023] : Assesses the legal reasoning abilities of LLMs through tasks crowdsourced by legal professionals

        • +
        • Berkeley Function Leaderboard (BFCL) [Patil et al., 2023]: Evaluates LLMs’ function-calling abilities

        As language models continue to advance in capability and complexity, evaluation frameworks must evolve. Modern benchmarks increasingly incorporate tests for nuanced reasoning, ethical decision-making, and emergent capabilities that weren’t previously measurable. This ongoing evolution reflects a deeper understanding that the true value of language models lies not in achieving high scores on standardized tests with narrow task-specific metrics, but in their ability to meaningfully contribute to human understanding and help solve real-world problems while demonstrating the ability to learn and adapt to new tasks.

        In the following sections, we will explore some open source tools developers can use to automate and streamline the challenging task of LLMs evals.

      -

      3.9. Tools

      +

      3.9. Tools

      -

      3.9.1. LightEval

      -

      LightEval [Fourrier et al., 2023] is a lightweight framework for evaluation of LLMs across a variety of standard and bespoke metrics and tasks across multiple inference backends via Python SDK and CLI.

      +

      3.9.1. LightEval

      +

      LightEval [Fourrier et al., 2023] is a lightweight framework for evaluation of LLMs across a variety of standard and bespoke metrics and tasks across multiple inference backends via Python SDK and CLI.

      As a motivating example, consider a scenario where financial data has been extracted from SEC financial filings and require econometric analysis. Tasks like estimating autoregressive models for time series forecasting or conducting hypothesis tests on market efficiency are common in financial analysis. Let’s evaluate how well different models perform on this type of task.

      First, we need to select a benchmark to assess LLMs capabilities in this domain. MMLU has a sub-benchmark called Econometrics we can use for this task. Table 3.4 shows a sample of the benchmark dataset from MMLU Econometrics. It consists of multiple-choice questions from econometrics and expected answers.

    @@ -1526,7 +1526,7 @@

    [Face, 2024] and metrics [Face, 2024]. The available tasks span multiple categories and benchmarks including BigBench, MMLU, TruthfulQA, WinoGrande, and HellaSwag. The framework also supports standard NLP evaluation metrics including BLEU, ROUGE, Exact Match, F1 Score, and Accuracy.

    +

    LightEval provides a comprehensive set of evaluation tasks [HuggingFace, 2024] and metrics [HuggingFace, 2024]. The available tasks span multiple categories and benchmarks including BigBench, MMLU, TruthfulQA, WinoGrande, and HellaSwag. The framework also supports standard NLP evaluation metrics including BLEU, ROUGE, Exact Match, F1 Score, and Accuracy.

    In our case, we choose to evaluate our LLMs on the MMLU econometrics task using zero-shot learning. Hence, we define the task as follows:

    -

    We would like to compare the performance of multiple open source models on the MMLU econometrics task. While we could download and evaluate each model locally, we prefer instead to evaluate them on a remote server to save time and resources. LightEval enables serving the model on a TGI-compatible server/container and then running the evaluation by sending requests to the server [Face, 2024].

    +

    We would like to compare the performance of multiple open source models on the MMLU econometrics task. While we could download and evaluate each model locally, we prefer instead to evaluate them on a remote server to save time and resources. LightEval enables serving the model on a TGI-compatible server/container and then running the evaluation by sending requests to the server [HuggingFace, 2024].

    For that purpose, we can leverage HuggingFace Serverless Inference API [1] and set a configuration file for LightEval as shown below, where <MODEL-ID> is the model identifier on HuggingFace (e.g. meta-llama/Llama-3.2-1B-Instruct) and <HUGGINGFACE-TOKEN> is the user’s HuggingFace API token. Alternatively, you could also pass an URL of a corresponding dedicated inference API if you have one.

    model:
       type: "tgi"
    @@ -1576,17 +1576,17 @@ 

    - + - + - +

    Llama3.2 Instruct

    LLaMA architecture-based pretrained and instruction-tuned generative models

    Llama-3.2-1B-Instruct
    Llama-3.2-3B-Instruct

    [Meta AI, 2024]

    [Meta AI, 2024]

    Qwen2.5 Instruct

    Instruction-tuned LLMs family built by Alibaba Cloud

    Qwen2.5-0.5B-Instruct
    Qwen2.5-1.5B-Instruct
    Qwen2.5-3B-Instruct

    [Face, 2024, Hui et al., 2024, Yang et al., 2024]

    [HuggingFace, 2024, Hui et al., 2024, Yang et al., 2024]

    SmolLM2 Instruct

    Instruction-tuned family of compact language models built by HuggingFace

    SmolLM2-360M-Instruct
    SmolLM2-1.7B-Instruct

    [Allal et al., 2024]

    [Allal et al., 2024]

    @@ -1599,10 +1599,10 @@

    [Hugging Face, 2024]. Its integration with the Hugging Face ecosystem and modular architecture make it particularly powerful for evaluating open source models. For further details, visit the official repository [Fourrier et al., 2023].

    +

    In summary, LightEval is a simple yet flexible and comprehensive framework for evaluating LLMs across a wide variety of tasks and metrics. It can serve as a first step in selecting your next LLM for a specific task given the exponential growth in number of (open source) models available [HuggingFace, 2024]. Its integration with the Hugging Face ecosystem and modular architecture make it particularly powerful for evaluating open source models. For further details, visit the official repository [Fourrier et al., 2023].

    -

    3.9.2. LangSmith

    +

    3.9.2. LangSmith

    Let’s revisit our evaluation example when we were interested in evaluating the quality of summaries generated by different (smaller and cheaper) LLM models compared to a benchmark model (larger and more expensive). Recal the setup:

    • Benchmark model: gpt-4o

    • @@ -2010,8 +2010,8 @@

      -

      3.9.3. PromptFoo

      -

      Promptfoo [promptfoo, 2024] is an open-source framework designed for evaluating applications that utilize LLMs. Key features include:

      +

      3.9.3. PromptFoo

      +

      Promptfoo [promptfoo, 2024] is an open-source framework designed for evaluating applications that utilize LLMs. Key features include:

      1. Automated Testing: Promptfoo provides automated testing capabilities, allowing developers to run custom evaluations tailored to their applications.

      2. Custom Probes: Developers can create custom probes to focus on specific use cases for instance decoupling prompts from tests cases.

      3. @@ -2302,7 +2302,7 @@

        Prompt Comparison R

        In conclusion, Promptfoo can serve as an effective LLM application evaluation tool particularly for its ability to decouple several components of the evaluation process. Hence enabling the user to focus on the most important aspects of the evaluation given the particular application and criteria making it a valuable and flexible tool for LLM application development.

    -

    3.9.4. Comparison

    +

    3.9.4. Comparison

    Table 3.6 provides a summarized comparative analysis of three open source frameworks for language models evaluation we have discussed: Lighteval, LangSmith, and Promptfoo. Each framework is assessed based on key features such as integration capabilities, customization options, ease of use, and the ability to facilitate human and LLM collaboration.

    @@ -2339,7 +2339,7 @@

    -

    3.10. Conclusion

    +

    3.10. Conclusion

    Language models have fundamentally transformed how software is developed and evaluated. Unlike conventional systems that produce predictable outputs, LLMs generate varied, probabilistic responses that defy traditional testing approaches. While developers accustomed to deterministic systems may find this shift challenging, continuing to rely on legacy testing methods is unsustainable. These frameworks were not designed to handle the inherent variability of LLM outputs and will ultimately prove inadequate.

    Success requires embracing this new paradigm by implementing comprehensive evals that cover the non-deterministic generative nature of LLMs - this is the new Product Requirements Document (PRD) - and cultivating an organizational mindset focused on iteration, experimentation and growth.

    The shift from traditional software testing to LLM evaluation is not just a change in tools but a transformation in mindset. Those who recognize and adapt to this shift will lead the way in harnessing the power of LLMs in software development.

    @@ -2356,166 +2356,166 @@

    -

    3.11. References

    +

    3.11. References

    -
    +
    [ALB+24]

    Loubna Ben Allal, Anton Lozhkov, Elie Bakouch, Gabriel Martín Blázquez, Lewis Tunstall, Agustín Piqueres, Andres Marafioti, Cyril Zakka, Leandro von Werra, and Thomas Wolf. Smollm2 - with great data, comes great performance. 2024.

    -
    +
    [Are24]

    Judge Arena. Judge arena: evaluating llm outputs with llms. https://judgearena.com/, 2024. Accessed: 2024.

    -
    +
    [BPS99]

    Sally C. Brailsford, Chris N. Potts, and Barbara M. Smith. Constraint satisfaction problems: algorithms and applications. European Journal of Operational Research, 119(3):557–581, 1999. URL: https://www.sciencedirect.com/science/article/pii/S0377221798003646, doi:https://doi.org/10.1016/S0377-2217(98)00364-6.

    -
    +
    [CTJ+21]

    Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, Alex Ray, Raul Puri, Gretchen Krueger, Michael Petrov, Heidy Khlaaf, Girish Sastry, Pamela Mishkin, Brooke Chan, Scott Gray, Nick Ryder, Mikhail Pavlov, Alethea Power, Lukasz Kaiser, Mohammad Bavarian, Clemens Winter, Philippe Tillet, Felipe Petroski Such, Dave Cummings, Matthias Plappert, Fotios Chantzis, Elizabeth Barnes, Ariel Herbert-Voss, William Hebgen Guss, Alex Nichol, Alex Paino, Nikolas Tezak, Jie Tang, Igor Babuschkin, Suchir Balaji, Shantanu Jain, William Saunders, Christopher Hesse, Andrew N. Carr, Jan Leike, Josh Achiam, Vedant Misra, Evan Morikawa, Alec Radford, Matthew Knight, Miles Brundage, Mira Murati, Katie Mayer, Peter Welinder, Bob McGrew, Dario Amodei, Sam McCandlish, Ilya Sutskever, and Wojciech Zaremba. Evaluating large language models trained on code. 2021. URL: https://arxiv.org/abs/2107.03374, arXiv:2107.03374.

    -
    +
    [CZS+24]

    Wei-Lin Chiang, Lianmin Zheng, Ying Sheng, Anastasios Nikolas Angelopoulos, Tianle Li, Dacheng Li, Hao Zhang, Banghua Zhu, Michael Jordan, Joseph E. Gonzalez, and Ion Stoica. Chatbot arena: an open platform for evaluating llms by human preference. 2024. URL: https://arxiv.org/abs/2403.04132, arXiv:2403.04132.

    -
    +
    [Cho24a]

    Francois Chollet. Arc prize 2024 results. ARC Prize Website, 12/08/2024. URL: https://arcprize.org/2024-results.

    -
    +
    [Cho24b]

    Francois Chollet. Abstraction and reasoning challenge. ARC Prize Website, 2024. URL: https://arcprize.org/.

    -
    +
    [DRCW+24]

    Darshan Deshpande, Selvan Sunitha Ravi, Sky CH-Wang, Bartosz Mielczarek, Anand Kannappan, and Rebecca Qian. Glider: grading llm interactions and decisions using explainable ranking. 2024. URL: https://arxiv.org/abs/2412.14140, arXiv:2412.14140.

    -
    +
    [DGLH24]

    Yann Dubois, Balázs Galambosi, Percy Liang, and Tatsunori B. Hashimoto. Length-controlled alpacaeval: a simple way to debias automatic evaluators. 2024. URL: https://arxiv.org/abs/2404.04475, arXiv:2404.04475.

    -
    -[Fac24a] -

    Hugging Face. Available tasks - lighteval wiki. https://github.com/huggingface/lighteval/wiki/Available-Tasks, 2024. Accessed: 2024.

    -
    -
    -[Fac24b] -

    Hugging Face. Evaluate the model on a server or container - lighteval wiki. https://github.com/huggingface/lighteval/wiki/Evaluate-the-model-on-a-server-or-container, 2024. Accessed: 2024.

    -
    -
    -[Fac24c] -

    Hugging Face. Gpt-2 documentation - hugging face transformers. https://huggingface.co/docs/transformers/model_doc/gpt2, 2024. Accessed: 2024.

    -
    -
    -[Fac24d] -

    Hugging Face. Llm as a judge. https://huggingface.co/learn/cookbook/en/llm_judge, 2024. Accessed: 2024.

    -
    -
    -[Fac24e] -

    Hugging Face. Metric list - lighteval wiki. https://github.com/huggingface/lighteval/wiki/Metric-List, 2024. Accessed: 2024.

    -
    -
    -[Fac24f] -

    Hugging Face. Open llm leaderboard. Hugging Face Spaces, 2024. URL: https://huggingface.co/spaces/open-llm-leaderboard/blog.

    -
    -
    +
    [FHWT23] (1,2)

    Clémentine Fourrier, Nathan Habib, Thomas Wolf, and Lewis Tunstall. Lighteval: a lightweight framework for llm evaluation. 2023. URL: https://github.com/huggingface/lighteval.

    -
    +
    [GNH+23]

    Neel Guha, Julian Nyarko, Daniel E. Ho, Christopher Ré, Adam Chilton, Aditya Narayana, Alex Chohlas-Wood, Austin Peters, Brandon Waldon, Daniel N. Rockmore, Diego Zambrano, Dmitry Talisman, Enam Hoque, Faiz Surani, Frank Fagan, Galit Sarfaty, Gregory M. Dickinson, Haggai Porat, Jason Hegland, Jessica Wu, Joe Nudell, Joel Niklaus, John Nay, Jonathan H. Choi, Kevin Tobia, Margaret Hagan, Megan Ma, Michael Livermore, Nikon Rasumov-Rahe, Nils Holzenberger, Noam Kolt, Peter Henderson, Sean Rehaag, Sharad Goel, Shang Gao, Spencer Williams, Sunny Gandhi, Tom Zur, Varun Iyer, and Zehua Li. Legalbench: a collaboratively built benchmark for measuring legal reasoning in large language models. 2023. URL: https://arxiv.org/abs/2308.11462, arXiv:2308.11462.

    -
    +
    [HBB+21]

    Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. Measuring massive multitask language understanding. 2021. URL: https://arxiv.org/abs/2009.03300, arXiv:2009.03300.

    -
    +
    [HBD+20]

    Ari Holtzman, Jan Buys, Li Du, Maxwell Forbes, and Yejin Choi. The curious case of neural text degeneration. 2020. URL: https://arxiv.org/abs/1904.09751, arXiv:1904.09751.

    -
    +
    +[Hug24a] +

    HuggingFace. Available tasks - lighteval wiki. https://github.com/huggingface/lighteval/wiki/Available-Tasks, 2024. Accessed: 2024.

    +
    +
    +[Hug24b] +

    HuggingFace. Evaluate the model on a server or container - lighteval wiki. https://github.com/huggingface/lighteval/wiki/Evaluate-the-model-on-a-server-or-container, 2024. Accessed: 2024.

    +
    +
    +[Hug24c] +

    HuggingFace. Gpt-2 documentation - huggingface transformers. https://huggingface.co/docs/transformers/model_doc/gpt2, 2024. Accessed: 2024.

    +
    +
    +[Hug24d] +

    HuggingFace. Llm as a judge. https://huggingface.co/learn/cookbook/en/llm_judge, 2024. Accessed: 2024.

    +
    +
    +[Hug24e] +

    HuggingFace. Metric list - lighteval wiki. https://github.com/huggingface/lighteval/wiki/Metric-List, 2024. Accessed: 2024.

    +
    +
    +[Hug24f] +

    HuggingFace. Open llm leaderboard. HuggingFace Spaces, 2024. URL: https://huggingface.co/spaces/open-llm-leaderboard/blog.

    +
    +
    [HYC+24]

    Binyuan Hui, Jian Yang, Zeyu Cui, Jiaxi Yang, Dayiheng Liu, Lei Zhang, Tianyu Liu, Jiajun Zhang, Bowen Yu, Kai Dang, and others. Qwen2.5 - coder technical report. arXiv preprint arXiv:2409.12186, 2024.

    -
    +
    [LXS+24] (1,2,3)

    Zhen Li, Xiaohan Xu, Tao Shen, Can Xu, Jia-Chen Gu, Yuxuan Lai, Chongyang Tao, and Shuai Ma. Leveraging large language models for nlg evaluation: advances and challenges. 2024. URL: https://arxiv.org/abs/2401.07103, arXiv:2401.07103.

    -
    +
    [LBL+23]

    Percy Liang, Rishi Bommasani, Tony Lee, Dimitris Tsipras, Dilara Soylu, Michihiro Yasunaga, Yian Zhang, Deepak Narayanan, Yuhuai Wu, Ananya Kumar, Benjamin Newman, Binhang Yuan, Bobby Yan, Ce Zhang, Christian Cosgrove, Christopher D. Manning, Christopher Ré, Diana Acosta-Navas, Drew A. Hudson, Eric Zelikman, Esin Durmus, Faisal Ladhak, Frieda Rong, Hongyu Ren, Huaxiu Yao, Jue Wang, Keshav Santhanam, Laurel Orr, Lucia Zheng, Mert Yuksekgonul, Mirac Suzgun, Nathan Kim, Neel Guha, Niladri Chatterji, Omar Khattab, Peter Henderson, Qian Huang, Ryan Chi, Sang Michael Xie, Shibani Santurkar, Surya Ganguli, Tatsunori Hashimoto, Thomas Icard, Tianyi Zhang, Vishrav Chaudhary, William Wang, Xuechen Li, Yifan Mai, Yuhui Zhang, and Yuta Koreeda. Holistic evaluation of language models. 2023. URL: https://arxiv.org/abs/2211.09110, arXiv:2211.09110.

    -
    +
    [LBC24]

    Bill Yuchen Lin, Ronan Le Bras, and Yejin Choi. Zebralogic: benchmarking the logical reasoning ability of language models. 2024. URL: https://huggingface.co/spaces/allenai/ZebraLogic.

    -
    +
    [LHE22]

    Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: https://arxiv.org/abs/2109.07958, arXiv:2109.07958.

    -
    +
    [PZWG23]

    Shishir G. Patil, Tianjun Zhang, Xin Wang, and Joseph E. Gonzalez. Gorilla: large language model connected with massive apis. arXiv preprint arXiv:2305.15334, 2023.

    -
    +
    [pro24]

    promptfoo. Promptfoo: llm testing and evaluation framework. 2024. Open source framework for testing and evaluating LLM prompts. URL: https://www.promptfoo.dev/.

    -
    +
    [Ras24]

    Sebastian Raschka. Build A Large Language Model (From Scratch). Manning, 2024. ISBN 978-1633437166. URL: https://www.manning.com/books/build-a-large-language-model-from-scratch.

    -
    +
    [SLL+24]

    Bhaskarjit Sarmah, Mingshu Li, Jingrao Lyu, Sebastian Frank, Nathalia Castellanos, Stefano Pasquali, and Dhagash Mehta. How to choose a threshold for an evaluation metric for large language models. 2024. URL: https://arxiv.org/abs/2412.12148, arXiv:2412.12148.

    -
    +
    [SRF+24]

    Shivalika Singh, Angelika Romanou, Clémentine Fourrier, David I. Adelani, Jian Gang Ngui, Daniel Vila-Suero, Peerat Limkonchotiwat, Kelly Marchisio, Wei Qi Leong, Yosephine Susanto, Raymond Ng, Shayne Longpre, Wei-Yin Ko, Madeline Smith, Antoine Bosselut, Alice Oh, Andre F. T. Martins, Leshem Choshen, Daphne Ippolito, Enzo Ferrante, Marzieh Fadaee, Beyza Ermis, and Sara Hooker. Global mmlu: understanding and addressing cultural and linguistic biases in multilingual evaluation. 2024. URL: https://arxiv.org/abs/2412.03304, arXiv:2412.03304.

    -
    +
    [SRR+23]

    Aarohi Srivastava, Abhinav Rastogi, Abhishek Rao, Abu Awal Md Shoeb, Abubakar Abid, Adam Fisch, Adam R. Brown, Adam Santoro, Aditya Gupta, Adrià Garriga-Alonso, Agnieszka Kluska, Aitor Lewkowycz, Akshat Agarwal, Alethea Power, Alex Ray, Alex Warstadt, Alexander W. Kocurek, Ali Safaya, Ali Tazarv, Alice Xiang, Alicia Parrish, Allen Nie, Aman Hussain, Amanda Askell, Amanda Dsouza, Ambrose Slone, Ameet Rahane, Anantharaman S. Iyer, Anders Andreassen, Andrea Madotto, Andrea Santilli, Andreas Stuhlmüller, Andrew Dai, Andrew La, Andrew Lampinen, Andy Zou, Angela Jiang, Angelica Chen, Anh Vuong, Animesh Gupta, Anna Gottardi, Antonio Norelli, Anu Venkatesh, Arash Gholamidavoodi, Arfa Tabassum, Arul Menezes, Arun Kirubarajan, Asher Mullokandov, Ashish Sabharwal, Austin Herrick, Avia Efrat, Aykut Erdem, Ayla Karakaş, B. Ryan Roberts, Bao Sheng Loe, Barret Zoph, Bartłomiej Bojanowski, Batuhan Özyurt, Behnam Hedayatnia, Behnam Neyshabur, Benjamin Inden, Benno Stein, Berk Ekmekci, Bill Yuchen Lin, Blake Howald, Bryan Orinion, Cameron Diao, Cameron Dour, Catherine Stinson, Cedrick Argueta, César Ferri Ramírez, Chandan Singh, Charles Rathkopf, Chenlin Meng, Chitta Baral, Chiyu Wu, Chris Callison-Burch, Chris Waites, Christian Voigt, Christopher D. Manning, Christopher Potts, Cindy Ramirez, Clara E. Rivera, Clemencia Siro, Colin Raffel, Courtney Ashcraft, Cristina Garbacea, Damien Sileo, Dan Garrette, Dan Hendrycks, Dan Kilman, Dan Roth, Daniel Freeman, Daniel Khashabi, Daniel Levy, Daniel Moseguí González, Danielle Perszyk, Danny Hernandez, Danqi Chen, Daphne Ippolito, Dar Gilboa, David Dohan, David Drakard, David Jurgens, Debajyoti Datta, Deep Ganguli, Denis Emelin, Denis Kleyko, Deniz Yuret, Derek Chen, Derek Tam, Dieuwke Hupkes, Diganta Misra, Dilyar Buzan, Dimitri Coelho Mollo, Diyi Yang, Dong-Ho Lee, Dylan Schrader, Ekaterina Shutova, Ekin Dogus Cubuk, Elad Segal, Eleanor Hagerman, Elizabeth Barnes, Elizabeth Donoway, Ellie Pavlick, Emanuele Rodola, Emma Lam, Eric Chu, Eric Tang, Erkut Erdem, Ernie Chang, Ethan A. Chi, Ethan Dyer, Ethan Jerzak, Ethan Kim, Eunice Engefu Manyasi, Evgenii Zheltonozhskii, Fanyue Xia, Fatemeh Siar, Fernando Martínez-Plumed, Francesca Happé, Francois Chollet, Frieda Rong, Gaurav Mishra, Genta Indra Winata, Gerard de Melo, Germán Kruszewski, Giambattista Parascandolo, Giorgio Mariani, Gloria Wang, Gonzalo Jaimovitch-López, Gregor Betz, Guy Gur-Ari, Hana Galijasevic, Hannah Kim, Hannah Rashkin, Hannaneh Hajishirzi, Harsh Mehta, Hayden Bogar, Henry Shevlin, Hinrich Schütze, Hiromu Yakura, Hongming Zhang, Hugh Mee Wong, Ian Ng, Isaac Noble, Jaap Jumelet, Jack Geissinger, Jackson Kernion, Jacob Hilton, Jaehoon Lee, Jaime Fernández Fisac, James B. Simon, James Koppel, James Zheng, James Zou, Jan Kocoń, Jana Thompson, Janelle Wingfield, Jared Kaplan, Jarema Radom, Jascha Sohl-Dickstein, Jason Phang, Jason Wei, Jason Yosinski, Jekaterina Novikova, Jelle Bosscher, Jennifer Marsh, Jeremy Kim, Jeroen Taal, Jesse Engel, Jesujoba Alabi, Jiacheng Xu, Jiaming Song, Jillian Tang, Joan Waweru, John Burden, John Miller, John U. Balis, Jonathan Batchelder, Jonathan Berant, Jörg Frohberg, Jos Rozen, Jose Hernandez-Orallo, Joseph Boudeman, Joseph Guerr, Joseph Jones, Joshua B. Tenenbaum, Joshua S. Rule, Joyce Chua, Kamil Kanclerz, Karen Livescu, Karl Krauth, Karthik Gopalakrishnan, Katerina Ignatyeva, Katja Markert, Kaustubh D. Dhole, Kevin Gimpel, Kevin Omondi, Kory Mathewson, Kristen Chiafullo, Ksenia Shkaruta, Kumar Shridhar, Kyle McDonell, Kyle Richardson, Laria Reynolds, Leo Gao, Li Zhang, Liam Dugan, Lianhui Qin, Lidia Contreras-Ochando, Louis-Philippe Morency, Luca Moschella, Lucas Lam, Lucy Noble, Ludwig Schmidt, Luheng He, Luis Oliveros Colón, Luke Metz, Lütfi Kerem Şenel, Maarten Bosma, Maarten Sap, Maartje ter Hoeve, Maheen Farooqi, Manaal Faruqui, Mantas Mazeika, Marco Baturan, Marco Marelli, Marco Maru, Maria Jose Ramírez Quintana, Marie Tolkiehn, Mario Giulianelli, Martha Lewis, Martin Potthast, Matthew L. Leavitt, Matthias Hagen, Mátyás Schubert, Medina Orduna Baitemirova, Melody Arnaud, Melvin McElrath, Michael A. Yee, Michael Cohen, Michael Gu, Michael Ivanitskiy, Michael Starritt, Michael Strube, Michał Swędrowski, Michele Bevilacqua, Michihiro Yasunaga, Mihir Kale, Mike Cain, Mimee Xu, Mirac Suzgun, Mitch Walker, Mo Tiwari, Mohit Bansal, Moin Aminnaseri, Mor Geva, Mozhdeh Gheini, Mukund Varma T, Nanyun Peng, Nathan A. Chi, Nayeon Lee, Neta Gur-Ari Krakover, Nicholas Cameron, Nicholas Roberts, Nick Doiron, Nicole Martinez, Nikita Nangia, Niklas Deckers, Niklas Muennighoff, Nitish Shirish Keskar, Niveditha S. Iyer, Noah Constant, Noah Fiedel, Nuan Wen, Oliver Zhang, Omar Agha, Omar Elbaghdadi, Omer Levy, Owain Evans, Pablo Antonio Moreno Casares, Parth Doshi, Pascale Fung, Paul Pu Liang, Paul Vicol, Pegah Alipoormolabashi, Peiyuan Liao, Percy Liang, Peter Chang, Peter Eckersley, Phu Mon Htut, Pinyu Hwang, Piotr Miłkowski, Piyush Patil, Pouya Pezeshkpour, Priti Oli, Qiaozhu Mei, Qing Lyu, Qinlang Chen, Rabin Banjade, Rachel Etta Rudolph, Raefer Gabriel, Rahel Habacker, Ramon Risco, Raphaël Millière, Rhythm Garg, Richard Barnes, Rif A. Saurous, Riku Arakawa, Robbe Raymaekers, Robert Frank, Rohan Sikand, Roman Novak, Roman Sitelew, Ronan LeBras, Rosanne Liu, Rowan Jacobs, Rui Zhang, Ruslan Salakhutdinov, Ryan Chi, Ryan Lee, Ryan Stovall, Ryan Teehan, Rylan Yang, Sahib Singh, Saif M. Mohammad, Sajant Anand, Sam Dillavou, Sam Shleifer, Sam Wiseman, Samuel Gruetter, Samuel R. Bowman, Samuel S. Schoenholz, Sanghyun Han, Sanjeev Kwatra, Sarah A. Rous, Sarik Ghazarian, Sayan Ghosh, Sean Casey, Sebastian Bischoff, Sebastian Gehrmann, Sebastian Schuster, Sepideh Sadeghi, Shadi Hamdan, Sharon Zhou, Shashank Srivastava, Sherry Shi, Shikhar Singh, Shima Asaadi, Shixiang Shane Gu, Shubh Pachchigar, Shubham Toshniwal, Shyam Upadhyay, Shyamolima, Debnath, Siamak Shakeri, Simon Thormeyer, Simone Melzi, Siva Reddy, Sneha Priscilla Makini, Soo-Hwan Lee, Spencer Torene, Sriharsha Hatwar, Stanislas Dehaene, Stefan Divic, Stefano Ermon, Stella Biderman, Stephanie Lin, Stephen Prasad, Steven T. Piantadosi, Stuart M. Shieber, Summer Misherghi, Svetlana Kiritchenko, Swaroop Mishra, Tal Linzen, Tal Schuster, Tao Li, Tao Yu, Tariq Ali, Tatsu Hashimoto, Te-Lin Wu, Théo Desbordes, Theodore Rothschild, Thomas Phan, Tianle Wang, Tiberius Nkinyili, Timo Schick, Timofei Kornev, Titus Tunduny, Tobias Gerstenberg, Trenton Chang, Trishala Neeraj, Tushar Khot, Tyler Shultz, Uri Shaham, Vedant Misra, Vera Demberg, Victoria Nyamai, Vikas Raunak, Vinay Ramasesh, Vinay Uday Prabhu, Vishakh Padmakumar, Vivek Srikumar, William Fedus, William Saunders, William Zhang, Wout Vossen, Xiang Ren, Xiaoyu Tong, Xinran Zhao, Xinyi Wu, Xudong Shen, Yadollah Yaghoobzadeh, Yair Lakretz, Yangqiu Song, Yasaman Bahri, Yejin Choi, Yichi Yang, Yiding Hao, Yifu Chen, Yonatan Belinkov, Yu Hou, Yufang Hou, Yuntao Bai, Zachary Seid, Zhuoye Zhao, Zijian Wang, Zijie J. Wang, Zirui Wang, and Ziyi Wu. Beyond the imitation game: quantifying and extrapolating the capabilities of language models. 2023. URL: https://arxiv.org/abs/2206.04615, arXiv:2206.04615.

    -
    +
    [WPN+19]

    Alex Wang, Yada Pruksachatkun, Nikita Nangia, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R. Bowman. Superglue: a stickier benchmark for general-purpose language understanding systems. Advances in Neural Information Processing Systems, 2019.

    -
    +
    [WSM+19]

    Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R. Bowman. Glue: a multi-task benchmark and analysis platform for natural language understanding. 2019. URL: https://arxiv.org/abs/1804.07461, arXiv:1804.07461.

    -
    +
    [WTB+22] (1,2)

    Jason Wei, Yi Tay, Rishi Bommasani, Colin Raffel, Barret Zoph, Sebastian Borgeaud, Dani Yogatama, Maarten Bosma, Denny Zhou, Donald Metzler, Ed H. Chi, Tatsunori Hashimoto, Oriol Vinyals, Percy Liang, Jeff Dean, and William Fedus. Emergent abilities of large language models. 2022. URL: https://arxiv.org/abs/2206.07682, arXiv:2206.07682.

    -
    +
    [WDR+24]

    Colin White, Samuel Dooley, Manley Roberts, Arka Pal, Ben Feuer, Siddhartha Jain, Ravid Shwartz-Ziv, Neel Jain, Khalid Saifullah, Siddartha Naidu, Chinmay Hegde, Yann LeCun, Tom Goldstein, Willie Neiswanger, and Micah Goldblum. Livebench: a challenging, contamination-free llm benchmark. 2024. URL: https://arxiv.org/abs/2406.19314, arXiv:2406.19314.

    -
    +
    [YYH+24]

    An Yang, Baosong Yang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Zhou, Chengpeng Li, Chengyuan Li, Dayiheng Liu, Fei Huang, Guanting Dong, Haoran Wei, Huan Lin, Jialong Tang, Jialin Wang, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Ma, Jin Xu, Jingren Zhou, Jinze Bai, Jinzheng He, Junyang Lin, Kai Dang, Keming Lu, Keqin Chen, Kexin Yang, Mei Li, Mingfeng Xue, Na Ni, Pei Zhang, Peng Wang, Ru Peng, Rui Men, Ruize Gao, Runji Lin, Shijie Wang, Shuai Bai, Sinan Tan, Tianhang Zhu, Tianhao Li, Tianyu Liu, Wenbin Ge, Xiaodong Deng, Xiaohuan Zhou, Xingzhang Ren, Xinyu Zhang, Xipin Wei, Xuancheng Ren, Yang Fan, Yang Yao, Yichang Zhang, Yu Wan, Yunfei Chu, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zhihao Fan. Qwen2 technical report. arXiv preprint arXiv:2407.10671, 2024.

    -
    +
    [ZCL24]

    Zhihan Zhang, Yixin Cao, and Lizi Liao. Finbench: benchmarking LLMs in complex financial problem solving and reasoning. 2024. URL: https://openreview.net/forum?id=AeGrf1uY0p.

    -
    +
    [ZCS+23]

    Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric P. Xing, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. Judging llm-as-a-judge with mt-bench and chatbot arena. 2023. URL: https://arxiv.org/abs/2306.05685, arXiv:2306.05685.

    -
    +
    [HuggingFace24] -

    Hugging Face. Number of models on hugging face. https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024?day=4, 2024. Accessed: 12/06/2024.

    +

    HuggingFace. Number of models on huggingface. https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024?day=4, 2024. Accessed: 12/06/2024.

    -
    +
    [MetaAI24] -

    Meta AI. Meta llama models on hugging face. https://huggingface.co/meta-llama, 2024. Accessed: 2024.

    +

    Meta AI. Meta llama models on huggingface. https://huggingface.co/meta-llama, 2024. Accessed: 2024.

    diff --git a/tamingllms/_build/html/notebooks/input.html b/tamingllms/_build/html/notebooks/input.html index 1295768..1c6c6d3 100644 --- a/tamingllms/_build/html/notebooks/input.html +++ b/tamingllms/_build/html/notebooks/input.html @@ -171,6 +171,8 @@
  • Retrieval-Augmented Generation
  • +
  • A Note on Frameworks
  • +
  • Case Studies
  • Conclusion
  • @@ -250,7 +252,7 @@
    -

    5. Managing Input Data

    +

    5. Managing Input Data

    One home run is much better than two doubles.

    —Steve Jobs

    @@ -258,68 +260,81 @@ -
    -

    Note

    -

    This Chapter is Work-in-Progress.

    -
    -

    5.1. Introduction

    -

    Large Language Models face several critical challenges in effectively processing input data. While advances in long-context language models (LCLMs) [Lee et al., 2024] have expanded the amount of information these systems can process simultaneously, significant challenges remain in managing and effectively utilizing extended inputs.

    -

    LLMs are sensitive to input formatting and structure, requiring careful data preparation to achieve optimal results [Tan et al., 2024]. They operate with knowledge cutoffs, providing potentially stale or outdated information that may not reflect current reality and demonstrate problems with temporal knowledge accuracy [Amayuelas et al., 2024]. LLMs also struggle with less common but important information showing a systematic loss of long-tail knowledge [Kotha et al., 2024].

    -

    Motivated by these challenges, this chapter explores two key components:

    +

    5.1. Introduction

    +

    While advances in long-context language models (LCs) [Lee et al., 2024] have expanded the amount of information these systems can process, significant challenges remain in managing and effectively utilizing extended data inputs:

    +
      +
    • LLMs are sensitive to input formatting and structure, requiring careful data preparation to achieve optimal results [He et al., 2024, Liu et al., 2024, Tan et al., 2024].

    • +
    • They operate with knowledge cutoffs, providing potentially stale or outdated information that may not reflect current reality and demonstrate problems with temporal knowledge accuracy [Amayuelas et al., 2024].

    • +
    • LLMs also face “lost-in-the-middle” problems [Wu et al., 2024] and struggle with less common but important information showing a systematic loss of long-tail knowledge [Kotha et al., 2024].

    • +
    +

    Motivated by these challenges, this chapter explores two key input data components:

      -
    1. Data Parsing: Parsing documents into a unified format that is suitable for LLMs to process.

    2. +
    3. Data Parsing and Chunking: Parsing and chunking documents into a unified format that is suitable and more manageable for LLMs to process.

    4. Retrieval Augmentation: Augmenting LLMs with the ability to retrieve relevant, recent, and specialized information.

    In data parsing, we will explore some useful open source tools that help transform data into LLM-compatible formats, demonstrating their impact through a case study of structured information extraction from complex PDFs. In a second case study, we will introduce some chunking strategies to help LLMs process long inputs and implement a particular technique called Chunking with Contextual Linking the enables contextually relevant chunk processing.

    -

    In retrieval augmentation, we will explore how to enhance LLMs with semantic search capabilities for incorporating external context using RAGs (Retrieval Augmented Generation). Through a detailed case study, we build a RAG system for querying live codebases, illustrating methods to bridge static model knowledge with dynamic information requirements.

    -

    In our last case study, we build a quiz generator using a LLM with large context window. We will explore some additional relevant techniques such as prompt caching and response verification through citations.

    +

    In retrieval augmentation, we will explore how to enhance LLMs with semantic search capabilities for incorporating external context using RAGs (Retrieval Augmented Generation) while discussing whether RAGs will be really needed in the future given the rise of long-context language models.

    +

    While RAGs are useful for incorporating external context, they are not a silver bullet nor a mandatory component for all LLM applications. In our last case study, we leverage long-context windows to build a quiz generator from a large knowledge base. We will also explore some additional relevant techniques such as prompt caching and response verification through citations.

    By the chapter’s conclusion, readers will possess relevant knowledge of input data management strategies for LLMs and practical expertise in selecting and implementing appropriate approaches and tools for specific use cases.

    -

    5.2. Parsing Documents

    -

    Building robust data ingestion and preprocessing pipelines is essential for any LLM application. This section explores tools and frameworks that streamline input data processing, in particular for parsing purposes, providing a unified interface for converting diverse data formats into standardized representations that LLMs can effectively process. By abstracting away format-specific complexities, they allow developers to focus on core application logic rather than parsing implementation details while maximizing the performance of the LLM.

    -

    We will cover open source tools and frameworks that provide parsing capabilities for a wide range of data formats. And we will demonstrate how some of these tools can be used to extract structured information from complex PDFs also discussing how the quality of the parser can impact LLM’s performance.

    +

    5.2. Parsing Documents

    +

    Data parsing and formatting play a critical role in LLMs performance [He et al., 2024, Liu et al., 2024, Tan et al., 2024]. Hence, building robust data ingestion and preprocessing pipelines is essential for any LLM application.

    +

    This section explores open source tools that streamline input data processing, in particular for parsing purposes, providing a unified interface for converting diverse data formats into standardized representations that LLMs can effectively process. By abstracting away format-specific complexities, they allow developers to focus on core application logic rather than parsing implementation details while maximizing the LLM performance.

    +

    We will cover open source tools that provide parsing capabilities for a wide range of data formats. And we will demonstrate how some of these tools can be used to extract structured information from complex PDFs demonstrating how the quality of the parser can impact LLM’s performance.

    -

    5.2.1. MarkItDown

    -

    MarkItDown is a Python package and CLI too developed by the Microsoft AutoGen team for converting various file formats to Markdown. It supports a wide range of formats including PDF, PowerPoint, Word, Excel, images (with OCR and EXIF metadata), audio (with transcription), HTML, and other text-based formats making it a useful tool for document indexing and LLM-based applications.

    +

    5.2.1. MarkItDown

    +

    MarkItDown [Microsoft, 2024] is a Python package and CLI tool developed by the Microsoft AutoGen team for converting various file formats to Markdown. It supports a wide range of formats including PDF, PowerPoint, Word, Excel, images (with OCR and EXIF metadata), audio (with transcription), HTML, and other text-based formats making it a useful tool for document indexing and LLM-based applications.

    Key features:

    • Simple command-line and Python API interfaces

    • @@ -338,8 +353,8 @@

      -

      5.2.2. Docling

      -

      Docling is a Python package developed by IBM Research for parsing and converting documents into various formats. It provides advanced document understanding capabilities with a focus on maintaining document structure and formatting.

      +

      5.2.2. Docling

      +

      Docling [IBM Research, 2024] is a Python package developed by IBM Research for parsing and converting documents into various formats. It provides advanced document understanding capabilities with a focus on maintaining document structure and formatting.

      Key features:

    -

    5.2.4. Structured Data Extraction

    -

    A common use case where document parsing matters is to structured data extraction from documents, particularly in the presence of complex formatting and layout. In this case study, we will extract the economic forecasts from Merrill Lynch’s CIO Capital Market Outlook released on December 16, 2024 [Merrill Lynch, 2024]. We will focus on page 7 of this document, which contains several economic variables organized in a mix of tables, text and images (see Fig. 5.1)

    +

    5.2.3. Structured Data Extraction

    +

    A common use case where document parsing matters is structured data extraction, particularly in the presence of complex formatting and layout. In this case study, we will extract the economic forecasts from Merrill Lynch’s CIO Capital Market Outlook released on December 16, 2024 [Merrill Lynch, 2024]. We will focus on page 7 of this document, which contains several economic variables organized in a mix of tables, text and images (see Fig. 5.1).

    -Forecast +Forecast
    -

    Fig. 5.1 Forecast

    +

    Fig. 5.1 Merrill Lynch’s CIO Capital Market Outlook released on December 16, 2024 [Merrill Lynch, 2024]

    @@ -399,7 +411,7 @@

    SequenceMatcher from the difflib package, which is a simple measure of the similarity between two strings based on the number of matches in the longest common subsequence.

    +

    How similar are the two results? We can use use Levenshtein distance to measure the similarity between the two results. We will also calculate a naive score using the SequenceMatcher from the difflib package, which is a simple measure of similarity between two strings based on the number of matches in the longest common subsequence.

    import Levenshtein
    @@ -447,7 +459,7 @@ 

    SequenceMatcher respectively.

    +

    It turns out that the two results are quite different, with a similarity score of about 13.98% and 17.77% for Levenshtein and SequenceMatcher, respectively.

    Docling’s result is a quite readable markdown displaying key economic variables and their forecasts. Conversely, MarkItDown’s result is a bit messy and hard to read but the information is there just not in a structured format. Does it matter? That’s what we will explore next.

    Docling’s result

    @@ -457,11 +469,11 @@

    Fig. 5.2 shows part of the parsed result from Docling.

    -
    +

    Fig. 5.2 shows part of the parsed result from Docling.

    +
    Docling's result
    -

    Fig. 5.2 Docling’s parsed result

    +

    Fig. 5.2 Docling’s parsed result

    MarkItDown’s result

    @@ -473,18 +485,18 @@

    Fig. 5.3 shows part of the parsed result from MarkItDown.

    -
    +

    Fig. 5.3 shows part of the parsed result from MarkItDown.

    +
    MarkItDown's parsed result
    -

    Fig. 5.3 MarkItDown’s parsed result

    +

    Fig. 5.3 MarkItDown’s parsed result

    Now, let’s focus on the economic forecasts. In particular, we are interested in extracting the CIO’s 2025E forecasts.

    Forecast 2025
    -

    Fig. 5.4 Forecast 2025

    +

    Fig. 5.4 Merrill Lynch’s CIO Economic Forecasts.

    We will define a Forecast pydantic model to represent an economic forecast composed of a financial_variable and a financial_forecast. We will also define a EconForecast pydantic model to represent the list of economic forecasts we want to extract from the document.

    @@ -500,7 +512,7 @@

    extract_prompt is kind of data the user would like to extract and doc is the input document to analyze.

    +

    We write a simple function to extract the economic forecasts from the document using an LLM model (with structured output) with the following prompt template, where extract_prompt represents the kind of data the user would like to extract and doc is the input document to analyze.

    BASE_PROMPT = f"""
         ROLE: You are an expert at structured data extraction. 
         TASK: Extract the following data {extract_prompt} from input DOCUMENT
    @@ -721,7 +733,7 @@ 

    Asset Class Weightings. The CIO view information is represented in a spectrum starting with “Underweight”, passing through “Neutral” and reaching “Overweight”. The actual view is marked by some colored dots in the chart. Let’s see if we can extract this relatively more complex information from the document.

    Asset Class Weightings
    @@ -748,7 +760,7 @@

    # Create DataFrame with specified columns
    @@ -916,10 +928,10 @@ 

    tables attribute of the DocumentConverter object.

    -

    By doing that, we observe that Docling extracted 7 tables from the document. Exporting tables from top down and left to right in order of appearance in the document. -Below, we can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts as well as the last table, which contains CIO Equity Sector Views.

    +

    By doing that, we observe that Docling extracted 7 tables from the document exporting tables from top down and left to right in order of appearance in the document. +Below, we display the first two and the last tables. We can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts as well as the last table, which contains CIO Equity Sector Views.

    -

    5.3. Retrieval-Augmented Generation

    -

    RAG is a technique that allows LLMs to retrieve information from a knowledge base to answer questions. It is a popular technique for building LLM applications that require knowledge-intensive tasks [Lewis et al., 2021].

    -

    RAG utilizes a retrieval system to fetch external knowledge and augment the LLM. It has proved effective in mitigating hallucinations of LLMs [Ni et al., 2024, Zhou et al., 2024].

    +

    5.3. Retrieval-Augmented Generation

    +

    What happens if we asked ChatGPT who’s the author of the book “Taming LLMs”?

    +
    +
    +
    from dotenv import load_dotenv
    +import os
    +
    +# Load environment variables from .env file
    +load_dotenv()
    +
    +from openai import OpenAI
    +client = OpenAI()
    +model = "gpt-4o-mini"
    +
    +
    +
    +
    +
    +
    +
    question = "Who's the Author of the Book Taming LLMs?"
    +
    +
    +
    +
    +
    +
    +
    response = client.chat.completions.parse(
    +    model="gpt-4o-mini",
    +    messages=[
    +        {"role": "user", "content": question}
    +    ]
    +)
    +response.choices[0].message.content
    +
    +
    +
    +
    +
    The book "Taming LLMs" is authored by *G. Arulkumaran, H. M. B. P. D. Karthikeyan, and I. A. M. Almasri.* If you need more information about the book or its contents, feel free to ask!
    +
    +
    +
    +
    +

    Turns out ChatGPT hallucinates. A quick web search on the before mentioned authors yields no results. In fact, those authors names are made up. And of course the correct answer would have been “Tharsis Souza”.

    +

    LLMs only have access to the information they have been trained on, which of course has been fixed at a point in time. Hence, LLMs operate with stale data. The problem gets exacerbated by the fact that LLMs are trained to provide an answer even if the answer is unknown by them, hence leading to hallucinations.

    +

    One solution to this problem is to use a retrieval system to fetch information from a knowledge base to provide recent and relevant context to user queries using so-called Retrieval Augmented Generation (RAG) system.

    +

    RAG utilizes a retrieval system to fetch external knowledge and augment LLM’s context. It is a useful technique for building LLM applications that require domain-specific information or knowledge-intensive tasks [Lewis et al., 2021]. It has also proved effective in mitigating LLMs hallucinations [Ni et al., 2024, Zhou et al., 2024].

    +

    In the above example, a RAG would help with hallucinations by grounding the LLM’s response to information provided in the knowledge base. Additional common use cases of RAG systems include:

    +
      +
    1. Enterprise Knowledge Management: RAG enables organizations to synthesize answers from diverse internal data sources like documents, databases, and communication channels. This creates a unified knowledge interface that can accurately answer questions using the organization’s own data.

    2. +
    3. Document Processing and Analysis: RAG excels at extracting and analyzing information from complex documents like financial reports, presentations, and spreadsheets. The system can enable LLMs to understand context and relationships across different document types and formats.

    4. +
    5. Intelligent Customer Support: By combining knowledge bases with conversational abilities, RAG powers chatbots and support systems that can maintain context across chat history, provide accurate responses, and handle complex customer queries while reducing hallucinations.

    6. +
    7. Domain-Specific Applications: RAG allows LLMs to be equipped with specialized knowledge in fields like medicine, law, or engineering by retrieving information from domain-specific literature, regulations, and technical documentation. This enables accurate responses aligned with professional standards and current best practices.

    8. +
    9. Code Documentation and Technical Support: RAG can help developers by retrieving relevant code examples, API documentation, and best practices from repositories and documentation, which often suffer updates frequently, enabling more accurate and contextual coding assistance.

    10. +
    +

    If LLMs alone work on stale, general-purpose data with the added challenge of being prone to hallucinations, RAG systems serve as an added capability enabling LLMs to work on recent, domain-specific knowledge increasing the likelihood of LLMs to provide responses that are factual and relevant to user queries.

    +
    +

    5.3.1. RAG Pipeline

    +

    RAG architectures vary but they all share the same goal: to retrieve relevant information from a knowledge base to maximize the LLM’s ability to effectively and accurately respond to prompts, particularly when the answer requires out-of-training data information.

    +

    We will introduce key components of a RAG system one by one leading to a full canonical RAG pipeline at the end that ultimately will be used to answer our original question “Who’s the author of the book Taming LLMs?”, accurately.

    +

    The following basic components will be introduced (see Fig. 5.6 for a visual representation):

    +
      +
    • Vector Database

      +
        +
      • Embeddings

      • +
      • Indexing

      • +
      +
    • +
    • Retrieval System including re-ranking

    • +
    • LLM Augmented Generation via in-context learning

    • +
    +

    Data extraction, parsing and chunking are also part of a canonical pipeline as we prepare the knowledge base. Those are concepts that we have already explored in the previous sections, hence we will be succinct here. We will start by preparing the knowledge base.

    +
    +RAG Pipeline +
    +

    Fig. 5.6 Simplified RAG Pipeline

    +
    +
    +
    +

    5.3.1.1. Preparing the Knowledge Base

    +

    Every RAG system requires a knowledge base. In our case, the knowledge base is a set of documents that we equip the LLM to answer our authorship question.

    +

    Hence, we will compose our knowledge base by adding the web version of (some of the chapters of) the book “Taming LLMs”, namely:

    +
      +
    • Introduction

    • +
    • Structured Output

    • +
    • Input (this very chapter)

    • +
    +
    +
    +
    book_url = "https://www.tamingllms.com/"
    +chapters = ["markdown/intro.html",
    +            "notebooks/structured_output.html",
    +            "notebooks/input.html"]
    +
    +chapter_urls = [f"{book_url}/{chapter}" for chapter in chapters]
    +chapter_ids = [chapter.split("/")[-1].replace(".html", "") for chapter in chapters]
    +
    +
    +
    +
    +

    We use Docling to download the chapters from the web and parse them as markdown files.

    +
    +
    +
    chapters = [converter.convert(chapter_url).document.export_to_markdown() for chapter_url in chapter_urls]
    +
    +
    +
    +
    +

    Now we are ready to store the chapters in a vector database to enable the construction of a retrieval system.

    +
    +
    +

    5.3.1.2. Vector Database

    +

    Vector databases are specialized databases designed to store and retrieve high-dimensional vectors, which are mathematical representations of data like text, images, or audio. These databases are optimized for similarity search operations, making them ideal for embeddings-based retrieval systems.

    +

    A typical pipeline involving a vector database includes the following:

    +
      +
    1. Input data is converted into “documents” forming a collection representing our knowledge base

    2. +
    3. Each document is converted into an embedding which are stored in the vector database

    4. +
    5. Embeddings are indexed in the vector database for efficient similarity search

    6. +
    7. The vector database is queried to retrieve the most relevant documents

    8. +
    9. The retrieved documents are used to answer questions

    10. +
    +

    Vector databases are not a mandatory component of RAG systems. In fact, we can use a simple list of strings to store the chapters (or their chunks) and then use the LLM to answer questions about the document. However, vector databases are useful for RAG applications as they enable:

    +
      +
    • Fast similarity search for finding relevant context

    • +
    • Efficient storage of document embeddings

    • +
    • Scalable retrieval for large document collections

    • +
    • Flexible querying with metadata filters

    • +
    +

    In that way, RAG applications can be seen as a retrieval system that uses a vector database to store and retrieve embeddings of documents, which in turn are used to augment LLMs with contextually relevant information as we will see in the next sections.

    +

    Here, we will use ChromaDB [ChromaDB, 2024b] as an example of an open source vector database but key features and concepts we cover are applicable to other vector databases, in general.

    +

    ChromaDB is a popular open-source vector database that offers:

    +
      +
    • Efficient storage and retrieval of embeddings

    • +
    • Support for metadata and filtering

    • +
    • Easy integration with Python applications

    • +
    • In-memory and persistent storage options

    • +
    • Support for multiple distance metrics

    • +
    +

    Other notable vector databases include Weaviate, FAISS, and Milvus.

    +

    In ChromaDB, we can create a vector database client as follows.

    +
    +
    +
    import chromadb
    +chroma_client = chromadb.Client()
    +
    +
    +
    +
    +

    This will create a vector database in memory. We can also create a persistent vector database by specifying a path to a directory or alternatively by using a cloud-based vector database service like AWS, Azure or GCP. We will use a vector database in memory for this example.

    +

    Next, we create a collection to store the embeddings of the chapters. And add our chapters as documents to the collection as follows.

    +
    +
    +
    collection = chroma_client.create_collection(name="taming_llms")
    +
    +collection.add(
    +    documents=chapters,
    +    ids=chapter_ids
    +)
    +
    +
    +
    +
    +

    We are ready to query the collection. We write a simple function that takes the collection, input query and number of retrieved results as argument and returns the retrieved documents.

    +
    +
    +
    def query_collection(collection, query_text, n_results=3):
    +    results = collection.query(
    +        query_texts=[query_text],
    +        n_results=n_results
    +    )
    +    return results
    +
    +
    +
    +
    +

    We write a simple query, enquiring the purpose of the book.

    +
    +
    +
    q = "What is the purpose of this book?"
    +res = query_collection(collection, q)
    +res.get("ids")
    +
    +
    +
    +
    +
    +
    +
    print([['intro', 'input', 'structured_output']])
    +
    +
    +
    +
    +

    As response, we obtain an object that contains several attributes including:

    +
      +
    • documents: The actual documents retrieved from the collection, i.e. the chapters

    • +
    • ids: The ids of the documents retrieved from the collection

    • +
    • distances: The distances of the documents to the query vector

    • +
    +

    We can see that the chapters “Introduction”, “Input” and “Structured Output” are retrieved from the collection ordered by their distance to the query vector.

    +

    We observe that the Introduction chapter is the most relevant one as it ranks first, followed by the Input and Structured Output chapters. Indeed, the purpose of the book is included in the Introduction chapter demonstrating the retrieval system successfully retrieved the most relevant document to the input query, in this simple example.

    +

    In order to understand how the retrieval system works and how the “distance to the query vector” is computed, we need to understand how the embeddings are created and how the documents are indexed.

    +

    Embeddings

    +

    Embeddings are numerical representations of data (including text, images, audio, etc.) that capture meaning, allowing machines to process data quantitatively. Each embedding can be represented as a vector of floating-point numbers such that embedded data with similar meanings produce similar, i.e. close, vectors [1].

    +

    For text data, small distances among embeddings suggest high semantic relatedness and large distances suggest low semantic relatedness among the embedded texts. HuggingFace provides a leaderboard of embeddings models [HuggingFace, 2024i], which are ranked by in dimensions such as classification, clustering and reranking performance.

    +

    Behind the scenes, ChromaDB is using the model all-MiniLM-L6-v2 by default [2] to create embeddings for the input documents and the query (see Fig. 5.7). This model is available in sentence_transformers [HuggingFace, 2024f]. Let’s see how it works.

    +
    +Embedding +
    +

    Fig. 5.7 Embedding

    +
    +
    +
    +
    +
    from sentence_transformers import SentenceTransformer
    +
    +embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    +
    +
    +
    +
    +

    We replicate what ChromaDB did by embedding our chapters as well as input query using sentence transformers.

    +
    +
    +
    q = "What is the purpose of this book?"
    +docs_to_embed = [q] + chapters
    +embeddings = embedding_model.encode(docs_to_embed)
    +print(embeddings.shape)
    +
    +
    +
    +
    +
    (4, 384)
    +
    +
    +
    +
    +

    As a result, we obtain four 384-dimensional vectors representing our embeddings (one for each of the three chapters and one for the input query).

    +

    Now we can calculate similarity among the embeddings. By default, sentence transformers uses cosine similarity to calculate the similarity between embeddings.

    +
    +
    +
    similarities = embedding_model.similarity(embeddings, embeddings)
    +similarities
    +
    +
    +
    +
    +
    tensor([[1.0000, 0.4402, 0.3022, 0.4028],
    +        [0.4402, 1.0000, 0.6606, 0.5807],
    +        [0.3022, 0.6606, 1.0000, 0.6313],
    +        [0.4028, 0.5807, 0.6313, 1.0000]])
    +
    +
    +

    Let’s visualize the similarity matrix to better understand the relationships between our documents in Fig. 5.8. The top row of the matrix represents the similarity of the input query against all chapters. That’s exactly what we previously obtained by querying ChromaDB which returned a response with documents ranked by similarity to input query.

    +
    +Similarity matrix heatmap +
    +

    Fig. 5.8 Similarity matrix heatmap showing relationships among query and chapters.

    +
    +
    +

    Calculating similarity among embeddings can become computationally intensive if brute force is used, i.e. pair-wise computation, as the number of documents grows in the knowledge base. Indexing is a technique to help address this challenge.

    +

    Indexing

    +

    Indexing is a crucial optimization technique that makes similarity searches faster and more efficient.

    +

    Without indexing, finding similar vectors would require an exhaustive search - comparing a query vector against every single vector in the database. For large datasets, this becomes prohibitively slow.

    +

    Common indexing strategies include:

    +
      +
    1. Tree-based Indexes

      +
        +
      • Examples include KD-trees and Ball trees

      • +
      • Work by partitioning the vector space into hierarchical regions

      • +
      • Effective for low-dimensional data but suffer from the “curse of dimensionality”

      • +
      +
    2. +
    3. Graph-based Indexes

      +
        +
      • HNSW (Hierarchical Navigable Small World) is a prominent example

      • +
      • Creates a multi-layered graph structure for navigation

      • +
      • Offers excellent search speed but requires more memory

      • +
      +
    4. +
    5. LSH (Locality-Sensitive Hashing)

      +
        +
      • Uses hash functions that map similar vectors to the same buckets

      • +
      • More memory-efficient than graph-based methods

      • +
      • May sacrifice some accuracy for performance

      • +
      +
    6. +
    7. Quantization-based Indexes

      +
        +
      • Product Quantization compresses vectors by encoding them into discrete values

      • +
      • Reduces memory footprint significantly

      • +
      • Good balance between accuracy and resource usage

      • +
      +
    8. +
    +

    HNSW is the underlying library for Chroma vector indexing and search [ChromaDB, 2024a]. HNSW provides fast searches with high accuracy but uses more memory. LSH and quantization methods offer better memory efficiency but may sacrifice some precision.

    +

    But are indexing + basic embeddings based similarity sufficient? Often not, as we will see next as we cover reranking technique.

    +
    +
    +

    5.3.1.3. Reranking

    +

    Let’s go back to querying our vector database. Here are additional examples.

    +

    First, we write a query about how to get structured output from LLMs. Successfully retrieving the “Structured Output” chapter from the book as top result.

    +
    +
    +
    q = "How to get structured output from LLMs?"
    +res = query_collection(collection, q)
    +res.get("ids")
    +
    +
    +
    +
    +
    [['structured_output', 'input', 'intro']]
    +
    +
    +
    +
    +

    Next, we would like to obtain a tutorial on Docling, a tool we covered in this very chapter. However, we fail to obtain the correct chapter and instead obtain the “Introduction” chapter as a result.

    +
    +
    +
    q = "Docling tutorial"
    +res = query_collection(collection, q)
    +res.get("ids")
    +
    +
    +
    +
    +
    [['intro', 'input', 'structured_output']]
    +
    +
    +
    +
    +

    Retrieval systems solely based on vector similarity search might miss semantic relevance. That brings the need for techniques that can improve accuracy of the retrieval system. One such technique is re-ranking.

    +

    Re-ranking is a method that can improve accuracy of the retrieval system by re-ranking the retrieved documents based on their relevance to the input query.

    +

    In the following, we will use the sentence_transformers library to re-rank the retrieved documents based on their relevance to the input query. We utilize the CrossEncoder model to re-rank the documents. Cross-Encoder models are more accurate at judging relevance at the cost of speed compared to basic vector-based similarity.

    +

    We can implement a reranking step in a RAG system using a Cross-Encoder model in the following steps:

    +
      +
    1. First, we initialize the Cross-Encoder model:

    2. +
    +
    model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)
    +
    +
    +
      +
    • Uses the ms-marco-MiniLM-L-6-v2 model, which is specifically trained for passage reranking

    • +
    • Sets a maximum sequence length of 512 tokens

    • +
    • This model is designed to score the relevance between query-document pairs

    • +
    +
      +
    1. Then we perform the reranking:

    2. +
    +
    scores = model.predict([(q, doc) for doc in res["documents"][0]])
    +
    +
    +
      +
    • Creates pairs of (query, document) for each retrieved document

    • +
    • The model predicts relevance scores for each pair

    • +
    • Higher scores indicate better semantic match between query and document

    • +
    +
      +
    1. Finally, we select the best match:

    2. +
    +
    print(res["documents"][0][np.argmax(scores)])
    +
    +
    +
      +
    • np.argmax(scores) finds the index of the highest scoring document

    • +
    • Uses that index to retrieve the most relevant document

    • +
    +

    We obtain the following scores for the retrieved documents (“intro”, “input”, “structured_output”), the higher the score, the more relevant the document is in relation to the input query.

    +
    array([-8.52623 , -6.328738, -8.750055], dtype=float32)
    +
    +
    +

    As a result, we obtain the index of the highest scoring document, which corresponds to the “input” chapter. Hence, the re-ranking step successfully retrieved the correct chapter.

    +
    +
    +
    print(res["ids"][0][np.argmax(scores)])
    +
    +
    +
    +
    +
    input
    +
    +
    +
    +
    +

    The ideia is to first run semantic similarity on embeddings, which should be fast but potentially inaccurate, and then run re-raking on the top-k results, which is more accurate but slower. By doing so, we can balance the speed and accuracy of the retrieval system.

    +

    Hence, instead of going over all retrieved documents:

    +
    scores = model.predict([(q, doc) for doc in res["documents"][0]])
    +
    +
    +

    We would run reranking on the TOPK results, where TOPK <<< number of documents:

    +
    scores = model.predict([(q, doc) for doc in res["documents"][0][:TOPK]])
    +
    +
    +
    +
    +

    5.3.1.4. LLMs with RAG

    +

    We are finally ready to use the retrieval system to help the LLM answer our authorship question. A common way to integrate RAGs with LLMs is via in-context learning. With in-context learning the LLM learns from the retrieved documents by providing them in the context window as represented in Fig. 5.9. This is accomplished via a prompt template structure as follows.

    +
    +In-Context Learning +
    +

    Fig. 5.9 RAG LLM with In-Context Learning

    +
    +
    +
     rag_system_prompt_template = f"""
    + You are a helpful assistant that answers questions based on the provided CONTEXT.
    +
    + CONTEXT: {context}
    + """
    +
    + user_prompt_template = f"""
    + QUESTION: {input}
    + """
    +
    +
    +

    This prompt strategy demonstrates a common in-context learning pattern where retrieved documents are incorporated into the LLM’s context to enhance response accuracy and relevance. The prompt structure typically consists of a system prompt that:

    +
      +
    • Sets clear boundaries for the LLM to use information from the provided context

    • +
    • Includes the retrieved documents as context

    • +
    +

    This approach:

    +
      +
    • Reduces hallucination by grounding responses in source documents

    • +
    • Improves answer relevance by providing contextually relevant information to the LLM

    • +
    +

    The context variable is typically populated with the highest-scoring document(s) from the retrieval step, while the input variable contains the user’s original query.

    +
    +
    +
    def RAG_qa(client, model, context, input):
    +    """
    +    Generate a summary of input using a given model
    +    """
    +    rag_system_prompt_template =  f"""You are a helpful assistant that answers questions based on the provided CONTEXT.
    +
    +    CONTEXT: {context}
    +    """
    +    
    +    response = client.chat.completions.create(
    +    model=model,
    +        messages=[{"role": "system", "content": rag_system_prompt_template},
    +                 {"role": "user", "content": f"QUESTION: {input}"}]
    +    )
    +    return response.choices[0].message.content
    +
    +
    +
    +
    +

    First, we set the LLM.

    +
    +
    +
    from dotenv import load_dotenv
    +import os
    +
    +# Load environment variables from .env file
    +load_dotenv()
    +
    +from openai import OpenAI
    +client = OpenAI()
    +model = "gpt-4o-mini"
    +
    +
    +
    +
    +

    Then, we run the retrieve step.

    +
    +
    +
    res = query_collection(collection, q)
    +
    +
    +
    +
    +

    Next, we run the re-ranking step setting it to consider the TOPK retrieved documents.

    +
    +
    +
    TOPK = 2
    +scores = model.predict([(q, doc) for doc in res["documents"][0][:TOPK]])
    +res_reranked = res["documents"][0][np.argmax(scores)]
    +
    +
    +
    +
    +

    We then pass the top document as context and invoke the LLM with our RAG-based template leading to a successful response.

    +
    +
    +
    answer = RAG_qa(model, res_reranked[0], question)
    +answer
    +
    +
    +
    +
    +
    The author of the book "Taming LLMs" is Tharsis Souza.
    +
    +
    +
    +
    +

    In this section, we motivated the use of RAGs as a tool to equip LLMs with relevant context and provided a canonical implementation of its core components. RAGs, however, can be implemented in many shapes and forms and entire books have been written about them. We point the user to additional resources if more specialized techniques and architectures are needed [Alammar and Grootendorst, 2024, Diamant, 2024, Kimothi, 2024, AthinaAI, 2024].

    +

    Next, we discuss RAGs challenges and limitations and conclude our RAGs section envisioning the future of RAGs challenged by the rise of long-context language models.

    +
    +
    +
    +

    5.3.2. Challenges and Limitations

    +

    While RAG systems offer powerful capabilities for enhancing LLM responses with external knowledge, they face several significant challenges and limitations that require careful consideration:

    +
      +
    • Data Quality and Accuracy: The effectiveness of RAG systems fundamentally depends on the quality and reliability of their knowledge sources. When these sources contain inaccurate, outdated, biased, or incomplete information, the system’s responses become unreliable. This challenge is particularly acute when dealing with rapidly evolving topics or when sourcing information from unverified channels.

    • +
    • Computational Cost and Latency: Implementing RAG systems at scale presents computational and operational challenges. The process of embedding documents, maintaining vector databases, and performing similarity searches across large knowledge bases demands computational, budget and operational resources. In real-time applications, these requirements can introduce noticeable latency, potentially degrading the user experience and limiting practical applications.

    • +
    • Explainability and Evaluation: The complexity of RAG systems, arising from the intricate interaction between retrieval mechanisms and generative models, makes it difficult to trace and explain their reasoning processes. Traditional evaluation metrics often fail to capture the nuanced aspects of RAG performance, such as contextual relevance and factual consistency. This limitation hampers both system improvement and stakeholder trust. Readers are encouraged to read Chapter The Evals Gap for general LLM evaluation issues as well as consider tools such as Ragas [Ragas, 2024] for RAG evaluation.

    • +
    • Hallucination Management: Though RAG systems help ground LLM responses in source documents, they do not completely eliminate hallucinations. The generative component may still produce content that extrapolates beyond or misinterprets the retrieved context. This risk becomes particularly concerning when the system confidently presents incorrect information with apparent source attribution.

    • +
    +

    Moreover, recent research has shed light on critical limitations of key techniques used in RAGs systems. A relevant finding pertains to reranking, which has shown [Jacob et al., 2024]:

    +
      +
    • Diminishing Returns: Performance degrades as the number of documents (K) increases, sometimes performing worse than basic retrievers when dealing with large datasets.

    • +
    • Poor Document Discrimination: Rerankers can be misled by irrelevant documents, sometimes assigning high scores to content with minimal relevance to the query.

    • +
    • Consistency Issues: Performance and relative rankings between different rerankers can vary significantly depending on the number of documents being processed.

    • +
    +
    +
    +

    5.3.3. Will RAGs exist in the future?

    +

    This question is posed as we contrast RAGs with LLMs with long-context windows (LC).

    +

    Recent research has shed light on this specific point [Li et al., 2024], suggesting that, on the one hand, RAGs can be seen as a cost-effective alternative to LC models:

    +
      +
    • RAGs offer lower computational cost compared to LC due to the significantly shorter input length required for processing.

    • +
    • This cost-efficiency arises because RAG reduces the number of input tokens to LLMs, which of course reduces usage cost as pricing is based on the number of input (and output) tokens.

    • +
    +

    On the other hand, this RAG benefit is achieved at the cost of performance:

    +
      +
    • Recent advancements in LLMs, in particular with Gemini-1.5 and GPT-4o models, demonstrate capabilities in understanding long contexts directly, which enables them to outperform RAG in terms of average performance

    • +
    • LC models can process extremely long contexts, such as Gemini 1.5 which can handle up to 1 million tokens, and these models benefit from large-scale pretraining to develop strong long-context capabilities.

    • +
    +

    This cost-performance trade-off is illustrated in Fig. 5.10, where LC models outperform RAGs in terms of average performance while RAGs are more cost-effective.

    +
    +Long-Context LLMs for Superior Performance +
    +

    Fig. 5.10 Long-Context LLMs demonstrate superior performance while RAGs are more cost-effective [Li et al., 2024].

    +
    +
    +

    Fig. 5.10 also shows a model called “SELF-ROUTE” which combines RAG and LC by routing queries based on model self-reflection. This is a hybrid approach that reduces computational costs while maintaining performance comparable to LC. The advantage of SELF-ROUTE is most significant for smaller values of k, where k is the number of retrieved text chunks, and SELF-ROUTE shows a marked improvement in performance over RAG, while as k increases the performance of RAG and SELF-ROUTE approaches that of LC.

    +

    Another example of a hybrid approach that combines the benefits of both LC and RAGs is RetroLLM [Li et al., 2024], which is a unified framework that integrates retrieval and generation into a single process, enabling language models to generate fine-grained evidence directly from a corpus. The key contribution is that this approach delivers those benefits while eliminating the need for a separate retriever, addressing limitations of traditional RAG methods. Experimental results demonstrate RetroLLM’s superior performance compared to traditional RAG methods, across both in-domain and out-of-domain tasks. It also achieves a significant reduction in token consumption due to its fine-grained evidence retrieval.

    +

    A relevant development in this area is the introduction of LOFT [Lee et al., 2024], a benchmark to assess this paradigm shift from RAGs to LCs, using real-world tasks requiring context up to millions of tokens. Evidence suggests LCs can deliver performance with simplified pipelines compared to RAGs, particularly for tasking requiring multi-hop reasoning over long contexts when using Chain-of-Thought [Wei et al., 2023]. However, LCs can still be outperformed by specialized retrievers, in particular Gecko, a specialized model fine-tuned on extensive text retrieval and similarity tasks.

    +

    Bottom-line: Do we really need RAGs? The answer is conditional:

    +
      +
    • RAG may be relevant when cost-effectiveness is a key requirement and where the model needs to access vast amounts of external knowledge without incurring high computational expenses. However, as LLMs context window sizes increase and LLMs cost per input token is decreases, RAG may not be as relevant as it was before.

    • +
    • Long-context LLMs are superior when performance is the primary concern, and the model needs to handle extensive texts that require deep contextual understanding and reasoning.

    • +
    • Hybrid approaches like SELF-ROUTE are valuable as they combine the strengths of RAG and LC offering a practical balance between cost and performance, especially for applications where both factors are critical.

    • +
    +

    Ultimately, the choice between RAG, LC, or a hybrid method depends on the specific requirements of the task, available resources, and the acceptable trade-off between cost and performance.

    +

    In a later case study, we demonstrate the power of LCs as we construct a Quiz generator with citations over a large knowledge base without the use of chunking nor RAGs.

    +
    +
    +
    +

    5.4. A Note on Frameworks

    +

    We have covered a few open source tools for parsing data and provided a canonical RAG pipeline directly using an open source VectorDB together with an LLM. There is a growing number of frameworks that offer similar functionality wrapping the same core concepts at a higher level of abstraction. The two most popular ones are Langchain and LlamaIndex.

    +

    For instance, the code below shows how to use LlamaIndex’s LlamaParse for parsing input documents, which offers support for a wide range of file formats (e.g. .pdf, .pptx, .docx, .xlsx, .html). We we can see that the code is very similar to the one we used for MarkitDown and Docling.

    +
    from llama_parse import LlamaParse
    +
    +# Initialize the parser
    +parser = LlamaParse(
    +    api_key="llx-your-api-key-here",
    +    result_type="markdown",  # Can be "markdown" or "text"
    +    verbose=True
    +)
    +
    +documents = parser.load_data(["./doc1.pdf", "./doc2.pdf"])
    +
    +
    +

    As another example, the code below replicates our ChromaDB-based retrieval system using LlamaIndex [LlamaIndex, 2024].

    +

    As we can see, similar concepts are used in both frameworks:

    +
      +
    • Documents to represent elements of the knowledge base

    • +
    • Collections to store the documents

    • +
    • Indexing of embeddings in the VectorDB and finally

    • +
    • Querying the VectorDB to retrieve the documents

    • +
    +
    import chromadb
    +from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
    +from llama_index.vector_stores.chroma import ChromaVectorStore
    +from llama_index.core import StorageContext
    +
    +# load some documents
    +documents = SimpleDirectoryReader("./data").load_data()
    +
    +# initialize client, setting path to save data
    +db = chromadb.PersistentClient(path="./chroma_db")
    +
    +# create collection
    +chroma_collection = db.get_or_create_collection("tamingllms")
    +
    +# assign chroma as the vector_store to the context
    +vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    +storage_context = StorageContext.from_defaults(vector_store=vector_store)
    +
    +# create your index
    +index = VectorStoreIndex.from_documents(
    +    documents, storage_context=storage_context
    +)
    +
    +# create a query engine and query
    +query_engine = index.as_query_engine()
    +response = query_engine.query("Who is the author of Taming LLMs?")
    +print(response)
    +
    +
    +

    Frameworks are useful for quickly prototyping RAG systems and for building applications on top of them as they provide a higher level of abstraction and integration with third-party libraries. However, the underlying concepts are the same as the ones we have covered in this chapter. More often than not, problems arise when developers either do not understand the underlying concepts or fail to understand the details of the implement behind the abstractions provided by the framework. Therefore, it is recommended to try and start your implementation using lower level tools as much as possible and only when (i) the underlying problem as well as (ii) the desired solution are well understood, then consider moving to higher level frameworks if really needed.

    -

    5.4. Case Studies

    -

    This section presents three case studies that demonstrate practical solutions to common LLM limitations:

    -

    First, Content Chunking with Contextual Linking showcases how intelligent chunking strategies can overcome both context window and output token limitations. This case study illustrates techniques for breaking down and reassembling content while maintaining coherence, enabling the generation of high-quality long-form outputs despite model constraints.

    -

    Second, a Retrieval Augmented Generation case study addresses the challenge of stale or outdated model knowledge. By implementing semantic search over a GitHub repository, this example demonstrates how to augment LLM responses with current, accurate information - allowing users to query and receive up-to-date answers about code repository contents.

    -

    Third, the final case study builds a Quiz generator with citations. This case study explores some additional input management techniques that become particularly useful when long context window is available. This includes implementing prompt caching for efficiency and adding citations to enhance response accuracy and verifiability. These approaches show how to maximize the benefits of larger context models while maintaining response quality.

    +

    5.5. Case Studies

    +

    This section presents two case studies to complement topics we have covered in this chapter in the context of managing input data for LLMs.

    +

    First, we cover content chunking, in particular Content Chunking with Contextual Linking which showcases how intelligent chunking strategies can overcome both context window and output token limitations. This case study illustrates techniques for breaking down and reassembling content while maintaining coherence, enabling the generation of high-quality long-form outputs despite model constraints.

    +

    Second, we build a Quiz generator with citations using long context window. Not all knowledge intense applications require RAGs. In this case study, we show how to use long context window as well as some additional input management techniques such as prompt caching for efficiency and reference management to enhance response accuracy and verifiability. These approaches show how to maximize the benefits of larger context models while maintaining response quality.

    -

    5.4.1. Case Study I: Content Chunking with Contextual Linking

    -

    Content chunking with contextual linking is a technique to break down long-form content into smaller, manageable chunks while keeping chunk-specific context. This approach tackles three problems:

    +

    5.5.1. Case Study I: Content Chunking with Contextual Linking

    +

    Content chunking is commonly used to breakdown long-form content into smaller, manageable chunks. In the context of RAGs, this can be helpful not only to help the retrieval system find more contextually relevant documents but also lead to a more cost efficient LLM solution since fewer tokens are processed in the context window. Furthermore, semantic chunking can increase accuracy of RAG systems [ZenML, 2024].

    +

    Content chunking with contextual linking is a chunking technique that seeks to split input content while keeping chunk-specific context, hence allowing the LLM to maintain coherence and context when generating responses per chunks. In that way, this technique tackles two key problems:

    1. The LLM’s inability to process long inputs to do context-size limits

    2. -
    3. The LLM’s inability to generate long-form content due to the max_output_tokens limitation.

    4. The LLM’s inability to maintain coherence and context when generating responses per chunks

    -

    Here, we exemplify this technique by following these steps:

    +

    As a consequence, a third problem is also tackled: LLM’s inability to generate long-form content due to the max_output_tokens limitation. Since we generate responses per chunk, as we will see later, we end up with a solution that is capable of generating long-form content while maintaining coherence.

    +

    We exemplify this technique by following these steps:

    1. Chunking the Content: The input content is split into smaller chunks. This allows the LLM to process each chunk individually, focusing on generating a complete and detailed response for that specific section of the input.

    2. Maintaining Context: Each chunk is linked with contextual information from the previous chunks. This helps in maintaining the flow and coherence of the content across multiple chunks.

    3. @@ -1425,7 +2040,7 @@

      -

      5.4.1.1. Generating long-form content

      +

      5.5.1.1. Generating long-form content

      • Goal: Generate a long-form report analyzing a company’s financial statement.

      • Input: A company’s 10K SEC filing.

      • @@ -1433,10 +2048,10 @@

        Content Chunking with Contextual Linking
        -

        Fig. 5.6 Content Chunking with Contextual Linking Schematic Representation.

        +

        Fig. 5.11 Content Chunking with Contextual Linking Schematic Representation.

        -

        The diagram in Fig. 5.6 illustrates the process we will follow for handling long-form content generation with Large Language Models through “Content Chunking with Contextual Linking.” It shows how input content is first split into manageable chunks using a chunking function (e.g. CharacterTextSplitter with tiktoken tokenizer), then each chunk is processed sequentially while maintaining context from previous chunks. For each chunk, the system updates the context, generates a dynamic prompt with specific parameters, makes a call to the LLM chain, and stores the response. After all chunks are processed, the individual responses are combined with newlines to create the final report, effectively working around the token limit constraints of LLMs while maintaining coherence across the generated content.

        +

        The diagram in Fig. 5.11 illustrates the process we will follow for handling long-form content generation with Large Language Models through “Content Chunking with Contextual Linking.” It shows how input content is first split into manageable chunks using a chunking function (e.g. CharacterTextSplitter with tiktoken tokenizer), then each chunk is processed sequentially while maintaining context from previous chunks. For each chunk, the system updates the context, generates a dynamic prompt with specific parameters, makes a call to the LLM chain, and stores the response. After all chunks are processed, the individual responses are combined with newlines to create the final report, effectively working around the token limit constraints of LLMs while maintaining coherence across the generated content.

        Step 1: Chunking the Content

        There are different methods for chunking, and each of them might be appropriate for different situations. However, we can broadly group chunking strategies in two types:

          @@ -1447,7 +2062,7 @@

          langchain for a content-aware sentence-splitting strategy for chunking. Langchain offers several text splitters [LangChain, 2024] such as JSON-, Markdown- and HTML-based or split by token. We will use the CharacterTextSplitter with tiktoken as our tokenizer to count the number of tokens per chunk which we can use to ensure that we do not surpass the input token limit of our model.

          +

          Here, we will utilize langchain for a content-aware sentence-splitting strategy for chunking. Langchain offers several text splitters [LangChain, 2024] such as JSON-, Markdown- and HTML-based or split by token. We will use the CharacterTextSplitter with tiktoken as our tokenizer to count the number of tokens per chunk which we can use to ensure that we do not surpass the input token limit of our model.

        @@ -1737,7 +2352,7 @@

        -

        5.4.1.2. Discussion

        +

        5.5.1.2. Discussion

        Results from the generated report present a few interesting aspects:

        • Coherence: The generated report demonstrates an apparent level of coherence. The sections are logically structured, and the flow of information is smooth. Each part of the report builds upon the previous sections, providing a comprehensive analysis of Apple Inc.’s financial performance and key risk factors. The use of headings and subheadings helps in maintaining clarity and organization throughout the document.

        • @@ -1748,28 +2363,25 @@

          Anthropic, 2024a]. The approach, as shown in Fig. 5.7, employs an LLM itself to generate relevant context per chunk before passing these two pieces of information together to the LLM. This process was proposed in the context of RAGs to enhance its retrieval capabilities but can be applied more generally to improve output generation.

          +

          Here, we implemented a simple strategy to improve the coherence in output generation given a multi-part chunked input. Many other strategies are possible. One related technique worth mentioning is Anthropic’s Contextual Retrieval [Anthropic, 2024a]. The approach, as shown in Fig. 5.12, employs an LLM itself to generate relevant context per chunk before passing these two pieces of information together to the LLM. This process was proposed in the context of RAGs to enhance its retrieval capabilities but can be applied more generally to improve output generation.

          Anthropic Contextual Linking
          -

          Fig. 5.7 Anthropic Contextual Linking [Anthropic, 2024a].

          +

          Fig. 5.12 Anthropic Contextual Linking [Anthropic, 2024a].

    -
    -

    5.4.2. Case Study II: Github RAG

    -
    -
    -

    5.4.3. Case Study III: Quiz Generation with Citations

    +
    +

    5.5.2. Case Study II: Quiz Generation with Citations

    In this case study, we will build a Quiz generator with citations that explores additional input management techniques particularly useful with long context windows. The implementation includes prompt caching for efficiency and citation tracking to enhance accuracy and verifiability. We will use Gemini 1.5 Pro as our LLM model, which has a context window of 2M tokens.

    -

    5.4.3.1. Use Case

    -

    Let’s assume you are a Harvard student enrolled in GOV 1039 “The Birth of Modern Democracy” (see Fig. 5.8), you face a daunting reading list for next Tuesday’s class on Rights. The readings include foundational documents like the Magna Carta, Declaration of Independence, and US Bill of Rights, each with specific sections to analyze.

    +

    5.5.2.1. Use Case

    +

    Let’s assume you are a Harvard student enrolled in GOV 1039 “The Birth of Modern Democracy” (see Fig. 5.13), you face a daunting reading list for next Tuesday’s class on Rights. The readings include foundational documents like the Magna Carta, Declaration of Independence, and US Bill of Rights, each with specific sections to analyze.

    Harvard Class
    -

    Fig. 5.8 Harvard’s Democratic Theory Class

    +

    Fig. 5.13 Harvard’s Democratic Theory Class

    Instead of trudging through these dense historical texts sequentially, we would like to:

    @@ -1780,7 +2392,7 @@

    -

    5.4.3.2. Implementation

    +

    5.5.2.2. Implementation

    The full implementation is available at Book’s Github repository. Here, we will cover the most relevant parts of the implementation.

    Client Class

    First, we will define the Client class which will provide the key interface users will interact with. It has the following summarized interface:

    @@ -1810,12 +2422,12 @@

    add() method is key since it is used to add content to the client. It takes a list of URLs and extracts the content from each URL using a content extractor (using MarkitDown). The content is then added to the conversation input memory in a way that enables citations using the “Corpus-in-Context” (CIC) Prompting [Lee et al., 2024].

    -

    Fig. 5.9 shows how CIC format is used to enable citations. It inserts a corpus into the prompt. Each candidate citable part (e.g., passage, chapter) in a corpus is assigned a unique identifier (ID) that can be referenced as needed for that task.

    +

    The add() method is key since it is used to add content to the client. It takes a list of URLs and extracts the content from each URL using a content extractor (using MarkitDown). The content is then added to the conversation input memory in a way that enables citations using the “Corpus-in-Context” (CIC) Prompting [Lee et al., 2024].

    +

    Fig. 5.14 shows how CIC format is used to enable citations. It inserts a corpus into the prompt. Each candidate citable part (e.g., passage, chapter) in a corpus is assigned a unique identifier (ID) that can be referenced as needed for that task.

    CIC Format
    -

    Fig. 5.9 Example of Corpus-in-Context Prompting for retrieval.

    +

    Fig. 5.14 Example of Corpus-in-Context Prompting for retrieval.

    CiC prompting leverages LLM’s capacity to follow instructions by carefully annotating the corpus with document IDs. It benefits from a strong, capable models to retrieve over large corpora provided in context.

    @@ -1911,7 +2523,7 @@

    {citations} instructs the model to add CiC citations to the response if user requests it.

    -

    5.4.3.3. Example Usage

    +

    5.5.2.3. Example Usage

    Dataset

    First, we will define our knowledge base.

      @@ -1979,16 +2591,16 @@

      Fig. 5.10 shows a sample quiz with citations. Marked in yellow are the citations which refer to the input IDs of the resources we added to the model.

      +

      Fig. 5.15 shows a sample quiz with citations. Marked in yellow are the citations which refer to the input IDs of the resources we added to the model.

      Quiz with Citations
      -

      Fig. 5.10 Sample Quiz with Citations.

      +

      Fig. 5.15 Sample Quiz with Citations.

    -
    -

    5.4.3.4. Discussion

    +
    +

    5.5.2.4. Discussion

    The experiment demonstrated the ability to build a knowledge base from multiple sources while leveraging prompt caching for efficiency and generate quizzes with citations for verifiability. The system successfully ingested content from Project Gutenberg texts, including historical documents like the Magna Carta, and used them to create interactive educational content.

    However, several limitations emerged during this process:

      @@ -2001,7 +2613,10 @@

      -

      5.5. Conclusion

      +

      5.6. Conclusion

      +

      This chapter has explored critical strategies and techniques for managing input data in LLM applications, focusing on three key areas: data parsing, retrieval augmentation, and practical implementation patterns. We examined how parsing tools like MarkItDown and Docling can transform diverse data formats into LLM-compatible representations, demonstrating through case studies how parser quality can impact LLM performance. The chapter also investigated retrieval augmentation techniques, particularly RAG systems, showing how they can enhance LLM capabilities by providing access to external knowledge while discussing their future relevance in the context of emerging long-context language models.

      +

      Through our case studies, we demonstrated practical approaches to handling common challenges in LLM applications. The Content Chunking with Contextual Linking case study illustrated techniques for managing long-form content generation while maintaining coherence across chunks. The Quiz Generation with Citations case study showcased how long-context windows can be effectively utilized without the need for complex retrieval systems, highlighting the importance of choosing the right approach based on specific application requirements rather than defaulting to more complex solutions.

      +

      As the field continues to evolve, the choice between traditional RAG systems and emerging long-context models will likely become increasingly nuanced. While RAGs offer cost-effective solutions for incorporating external knowledge, the rise of long-context models suggests a future where simpler architectures might suffice for many applications. The key insight is that effective input data management requires careful consideration of trade-offs among complexity, cost, and performance, always guided by specific application requirements rather than following a one-size-fits-all approach. Success in building robust LLM applications will depend on understanding these trade-offs and selecting appropriate strategies for each use case.

      CC BY-NC-SA 4.0

      @misc{tharsistpsouza2024tamingllms,
         author = {Tharsis T. P. Souza},
      @@ -2015,52 +2630,168 @@ 

      -

      5.6. References

      -
      -
      +

      5.7. References

      +
      +
      +[AG24] +

      Jay Alammar and Maarten Grootendorst. Hands-On Large Language Models. O'Reilly, 2024. ISBN 978-1098150969. URL: https://www.oreilly.com/library/view/hands-on-large-language/9781098150952/.

      +
      +
      [AWP+24]

      Alfonso Amayuelas, Kyle Wong, Liangming Pan, Wenhu Chen, and William Yang Wang. Knowledge of knowledge: exploring known-unknowns uncertainty with large language models. In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, Findings of the Association for Computational Linguistics: ACL 2024, 6416–6432. Bangkok, Thailand, August 2024. Association for Computational Linguistics. URL: https://aclanthology.org/2024.findings-acl.383, doi:10.18653/v1/2024.findings-acl.383.

      -
      -[KSR24] +
      +[BCV14] +

      Yoshua Bengio, Aaron Courville, and Pascal Vincent. Representation learning: a review and new perspectives. 2014. URL: https://arxiv.org/abs/1206.5538, arXiv:1206.5538.

      +
      +
      +[Dia24] +

      Nir Diamant. Rag techniques. GitHub Repository, 2024. Collection of advanced RAG techniques and implementation patterns. URL: https://github.com/NirDiamant/RAG_Techniques.

      +
      +
      +[HRK+24] +(1,2) +

      Jia He, Mukund Rungta, David Koleczek, Arshdeep Sekhon, Franklin X Wang, and Sadid Hasan. Does prompt formatting have any impact on llm performance? 2024. URL: https://arxiv.org/abs/2411.10541, arXiv:2411.10541.

      +
      +
      +[JLZ+24] +

      Mathew Jacob, Erik Lindgren, Matei Zaharia, Michael Carbin, Omar Khattab, and Andrew Drozdov. Drowning in documents: consequences of scaling reranker inference. 2024. URL: https://arxiv.org/abs/2411.11767, arXiv:2411.11767.

      +
      +
      +[Kim24] +

      Abhinav Kimothi. A Simple Guide to Retrieval Augmented Generation. Manning Publications, 2024. ISBN 9781633435858. Manning Early Access Program (MEAP). URL: https://www.manning.com/books/a-simple-guide-to-retrieval-augmented-generation.

      +
      +
      +[KSR24]

      Suhas Kotha, Jacob Mitchell Springer, and Aditi Raghunathan. Understanding catastrophic forgetting in language models via implicit inference. In The Twelfth International Conference on Learning Representations. 2024. URL: https://openreview.net/forum?id=VrHiF2hsrm.

      -
      +
      [LCD+24] -(1,2) +(1,2,3)

      Jinhyuk Lee, Anthony Chen, Zhuyun Dai, Dheeru Dua, Devendra Singh Sachan, Michael Boratko, Yi Luan, Sébastien M. R. Arnold, Vincent Perot, Siddharth Dalmia, Hexiang Hu, Xudong Lin, Panupong Pasupat, Aida Amini, Jeremy R. Cole, Sebastian Riedel, Iftekhar Naim, Ming-Wei Chang, and Kelvin Guu. Can long-context language models subsume retrieval, rag, sql, and more? 2024. URL: https://arxiv.org/abs/2406.13121, arXiv:2406.13121.

      -
      -[LPP+21] +
      +[LPP+21]

      Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, and Douwe Kiela. Retrieval-augmented generation for knowledge-intensive nlp tasks. 2021. URL: https://arxiv.org/abs/2005.11401, arXiv:2005.11401.

      -
      -[NBGC24] +
      +[LJZ+24] +

      Xiaoxi Li, Jiajie Jin, Yujia Zhou, Yongkang Wu, Zhonghua Li, Qi Ye, and Zhicheng Dou. Retrollm: empowering large language models to retrieve fine-grained evidence within generation. 2024. URL: https://arxiv.org/abs/2412.11919, arXiv:2412.11919.

      +
      +
      +[LLZ+24] +(1,2) +

      Zhuowan Li, Cheng Li, Mingyang Zhang, Qiaozhu Mei, and Michael Bendersky. Retrieval augmented generation or long-context llms? a comprehensive study and hybrid approach. 2024. URL: https://arxiv.org/abs/2407.16833, arXiv:2407.16833.

      +
      +
      +[LFC+24] +(1,2) +

      Kai Liu, Zhihang Fu, Chao Chen, Wei Zhang, Rongxin Jiang, Fan Zhou, Yaowu Chen, Yue Wu, and Jieping Ye. Enhancing llm's cognition via structurization. 2024. URL: https://arxiv.org/abs/2407.16434, arXiv:2407.16434.

      +
      +
      +[Lla24] +

      LlamaIndex. Llamaparse: extract structured data from text and pdfs using llms. 2024. LlamaParse. URL: https://github.com/run-llama/llama_parse.

      +
      +
      +[NBGC24]

      Shiyu Ni, Keping Bi, Jiafeng Guo, and Xueqi Cheng. When do LLMs need retrieval augmentation? mitigating LLMs' overconfidence helps retrieval augmentation. In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, Findings of the Association for Computational Linguistics: ACL 2024, 11375–11388. Bangkok, Thailand, August 2024. Association for Computational Linguistics. URL: https://aclanthology.org/2024.findings-acl.675, doi:10.18653/v1/2024.findings-acl.675.

      -
      -[TDW+24] +
      +[TDW+24] +(1,2)

      Jiejun Tan, Zhicheng Dou, Wen Wang, Mang Wang, Weipeng Chen, and Ji-Rong Wen. Htmlrag: html is better than plain text for modeling retrieved knowledge in rag systems. 2024. URL: https://arxiv.org/abs/2411.02959, arXiv:2411.02959.

      -
      -[ZLJ+24] +
      +[WWS+23] +

      Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Brian Ichter, Fei Xia, Ed Chi, Quoc Le, and Denny Zhou. Chain-of-thought prompting elicits reasoning in large language models. 2023. URL: https://arxiv.org/abs/2201.11903, arXiv:2201.11903.

      +
      +
      +[WIP+24] +

      Yunshu Wu, Hayate Iso, Pouya Pezeshkpour, Nikita Bhutani, and Estevam Hruschka. Less is more for long document summary evaluation by llms. 2024. URL: https://arxiv.org/abs/2309.07382, arXiv:2309.07382.

      +
      +
      +[ZLJ+24]

      Yujia Zhou, Zheng Liu, Jiajie Jin, Jian-Yun Nie, and Zhicheng Dou. Metacognitive retrieval-augmented large language models. In Proceedings of the ACM Web Conference 2024, WWW '24, 1453–1463. New York, NY, USA, 2024. Association for Computing Machinery. URL: https://doi.org/10.1145/3589334.3645481, doi:10.1145/3589334.3645481.

      -
      +
      [Anthropic4a] -(1,2) +(1,2)

      Anthropic. Introducing contextual retrieval. 09 2024a. URL: https://www.anthropic.com/news/contextual-retrieval.

      -
      -[LangChain24] +
      +[AthinaAI24] +

      AthinaAI. Rag cookbooks. GitHub Repository, 2024. Collection of recipes and best practices for building RAG applications. URL: https://github.com/athina-ai/rag-cookbooks.

      +
      +
      +[ChromaDB4a] +

      ChromaDB. Chromadb cookbook: hnsw configuration. Website, 2024a. URL: https://cookbook.chromadb.dev/core/configuration/#hnsw-configuration.

      +
      +
      +[ChromaDB4b] +

      ChromaDB. Chromadb documentation. Website, 2024b. URL: https://docs.trychroma.com/.

      +
      +
      +[HuggingFace4f] +

      HuggingFace. Sentence transformers. Website, 2024f. URL: https://huggingface.co/sentence-transformers.

      +
      +
      +[HuggingFace4i] +

      HuggingFace. Massive text embedding benchmark (mteb) leaderboard. Website, 2024i. URL: https://huggingface.co/spaces/mteb/leaderboard.

      +
      +
      +[IBMResearch24] +

      IBM Research. Docling: a document-level linguistic annotation framework. GitHub Repository, 2024. Framework for document-level linguistic annotation and analysis. URL: https://github.com/DS4SD/docling.

      +
      +
      +[LangChain24]

      LangChain. Text splitters - langchain documentation. https://python.langchain.com/docs/how_to/#text-splitters, 2024. Accessed: 12/07/2024.

      -
      -[MerrillLynch24] +
      +[LlamaIndex24] +

      LlamaIndex. Storing - llamaindex documentation. Website, 2024. URL: https://docs.llamaindex.ai/en/stable/understanding/storing/storing/.

      +
      +
      +[MendableAI24] +

      Mendable AI. Firecrawl: a fast and efficient web crawler for llm training data. GitHub Repository, 2024. High-performance web crawler optimized for collecting LLM training data. URL: https://github.com/mendableai/firecrawl.

      +
      +
      +[MerrillLynch24] +(1,2)

      Merrill Lynch. Chief investment officer capital market outlook. CIO Weekly Letter, 2024. URL: https://olui2.fs.ml.com/publish/content/application/pdf/gwmol/me-cio-weekly-letter.pdf.

      +
      +[Microsoft24] +

      Microsoft. Markitdown: structured generation with large language models. GitHub Repository, 2024. Framework for structured text generation using LLMs. URL: https://github.com/microsoft/markitdown.

      +
      +[OpenAI24] +

      OpenAI. What are embeddings? Website, 2024. URL: https://platform.openai.com/docs/guides/embeddings/what-are-embeddings.

      +
      +[Ragas24] +

      Ragas. Rag evaluation - ragas documentation. Website, 2024. URL: https://docs.ragas.io/en/stable/getstarted/rag_evaluation/.

      +
      +
      +[Unstructuredio24] +

      Unstructured.io. Unstructured: open source libraries for pre-processing documents. GitHub Repository, 2024. URL: https://github.com/Unstructured-IO/unstructured.

      +
      +
      +[ZenML24] +

      ZenML. Scaling rag accuracy from 49% to 86% in finance q&a assistant. Website, 2024. URL: https://www.zenml.io/llmops-database/scaling-rag-accuracy-from-49-to-86-in-finance-q-a-assistant.

      +
      +
      +
      +
      +

    diff --git a/tamingllms/_build/html/notebooks/local.html b/tamingllms/_build/html/notebooks/local.html index 173e81a..b1a01f9 100644 --- a/tamingllms/_build/html/notebooks/local.html +++ b/tamingllms/_build/html/notebooks/local.html @@ -252,7 +252,7 @@
    -

    8. Local LLMs in Practice

    +

    8. Local LLMs in Practice

    Freedom is something that dies unless it’s used.

    —Hunter S. Thompson

    @@ -260,55 +260,55 @@
    -

    8.1. Introduction

    +

    8.1. Introduction

    Running Open Source LLMs locally versus depending on proprietary cloud-based models represents more than just a technical choice - it’s a fundamental re-imagining of how we interact with AI technology, putting control back in the hands of users.

    Privacy concerns are a key driver for running LLMs locally. Individual users may want to process personal documents, photos, emails, and chat messages without sharing sensitive data with third parties. For enterprise use cases, organizations handling medical records must comply with HIPAA regulations that require data to remain on-premise. Similarly, businesses processing confidential documents and intellectual property, as well as organizations subject to GDPR and other privacy regulations, need to maintain strict control over their data processing pipeline.

    Cost considerations are another key driver. Organizations and individual consumers can better control expenses by matching model capabilities to their specific needs rather than paying for multiple cloud API subscriptions. For organizations with high-volume applications, this customization and control over costs becomes especially valuable compared to the often prohibitive per-request pricing of cloud solutions. For consumers, running multiple open source models locally eliminates the need to maintain separate subscriptions to access different model capabilities.

    @@ -318,11 +318,11 @@

    -

    8.2. Choosing your Model

    +

    8.2. Choosing your Model

    The landscape of open source LLMs is rapidly evolving, with new models emerging by the day. While proprietary LLMs have garnered significant attention, open source LLMs are gaining traction due to their flexibility, customization options, and cost-effectiveness.

    It is important to observe long-term strategic considerations when choosing a model. These entails prioritization dimensions that may enable competitive advantage in the long-term, including:

      -
    1. Managed Services Support: You may start experimenting locally with LLMs but eventually you will need to deployment options: either host models yourself or consider managed services. Cloud providers like AWS Bedrock, SambaNova and Together.ai can simplify deployment and management but model family support varies along with varying SLAs for model availability, support and model serving [Analysis, 2024]. One should evaluate the availability of managed services for your target model family.

    2. +
    3. Managed Services Support: You may start experimenting locally with LLMs but eventually you will need to deployment options: either host models yourself or consider managed services. Cloud providers like AWS Bedrock, SambaNova and Together.ai can simplify deployment and management but model family support varies along with varying SLAs for model availability, support and model serving [Analysis, 2024]. One should evaluate the availability of managed services for your target model family.

    4. Vendor Long-Term Viability: Consider vendor’s long-term strategy and transparency around future development. Evaluate factors like funding, market position, and development velocity to assess whether the vendor will remain a reliable partner. Further, transparency around long-term strategy and roadmap is a critical consideration when choosing a model vendor partner.

    5. Single-Provider Lock-in: Users and organizations should avoid the risk of lock-in by remaining flexible with your choice of LLM providers. Today’s winning models are not guaranteed to be the same in the future.

    6. Time-to-market and Customization: As the same models are available to everyone, base capabilities are becoming commoditized. As a consequence, competitive advantage comes from the application layer. Hence, the ability to iterate fast while customizing to your specific domain becomes a critical strategic consideration when choosing a model.

    7. @@ -330,7 +330,7 @@

      -

      8.2.1. Task Suitability

      +

      8.2.1. Task Suitability

      When evaluating an open source LLM, task suitability is a critical first consideration. A model that performs well on general benchmarks may struggle with specific domain tasks. Understanding the intended use case helps narrow down model options based on their demonstrated strengths.

      Task Categories

      When determining which LLM task to prioritize, carefully consider your specific use case and end-user needs. Different applications require distinct model capabilities and optimizations. Common LLM Task Categories include:

      @@ -344,11 +344,11 @@

      Fig. 8.1 shows the number models per task category available at Hugging Face as of December 22, 2024 [Face, 2024t]. Text generation is by far the most popular task category.

      +

      Fig. 8.1 shows the number models per task category available at Hugging Face as of December 22, 2024 [HuggingFace, 2024t]. Text generation is by far the most popular task category.

      Task Number
      -

      Fig. 8.1 Number of models per task category from Hugging Face as of December 22, 2024 [Face, 2024t].

      +

      Fig. 8.1 Number of models per task category from Hugging Face as of December 22, 2024 [HuggingFace, 2024t].

      Model Types

      @@ -364,8 +364,8 @@

      Fig. 8.2 Model Types.

      -

      The Llama 2 model family [Touvron et al., 2023] illustrates these distinctions well. The base Llama 2, trained on 2 trillion tokens of public data, demonstrates general-purpose capabilities across text generation and translation tasks. Its chat-optimized instruction-tuned variant, Llama 2-Chat, underwent additional fine-tuning on over 1 million human-annotated conversational examples, making it particularly adept at natural dialogue.

      -

      Benchmark results [Meta AI, 2024c] in Table 8.1 highlight the impact of model specialization. On the TruthfulQA [Lin et al., 2022] and Toxigen [Alnajjar and others, 2024] benchmarks measuring truthful and informative responses. We observe that the chat-optimized variants show substantially improved truthfulness. Similarly, on the ToxiGen benchmark measuring toxic content generation, Llama 2-Chat models demonstrate near-zero toxicity compared to base models’ 21-26% rates.

      +

      The Llama 2 model family [Touvron et al., 2023] illustrates these distinctions well. The base Llama 2, trained on 2 trillion tokens of public data, demonstrates general-purpose capabilities across text generation and translation tasks. Its chat-optimized instruction-tuned variant, Llama 2-Chat, underwent additional fine-tuning on over 1 million human-annotated conversational examples, making it particularly adept at natural dialogue.

      +

      Benchmark results [Meta AI, 2024c] in Table 8.1 highlight the impact of model specialization. On the TruthfulQA [Lin et al., 2022] and Toxigen [Alnajjar and others, 2024] benchmarks measuring truthful and informative responses. We observe that the chat-optimized variants show substantially improved truthfulness. Similarly, on the ToxiGen benchmark measuring toxic content generation, Llama 2-Chat models demonstrate near-zero toxicity compared to base models’ 21-26% rates.

    Table 3.6 Comparison of Lighteval, LangSmith, and Promptfoo
    @@ -408,7 +408,7 @@

    [Hui et al., 2024] is an example of a purpose-built model that demonstrates significant performance on the specific task of code generation.

    +

    While Llama family of models exhibits strong performance across general knowledge, instruction following, and specialized domains, purpose-built models may still outperform it in highly specific applications. Qwen/Qwen2.5-Coder-32B-Instruct [Hui et al., 2024] is an example of a purpose-built model that demonstrates significant performance on the specific task of code generation.

    Model Features

    Model features can either enable or limit the feasibility of specific use cases. Understanding features of your candidate models is crucial for determining whether a model is suitable for your application. For example:

      @@ -420,9 +420,9 @@

      -

      8.2.2. Performance & Cost

      +

      8.2.2. Performance & Cost

      General benchmarks are useful for comparing models across different standard tasks. Open Source models are becoming more competitive with proprietary models with LLama, Qwen, DeepSeek and Mistral model families being some of the most powerful open source models available today.

      -

      Qwen model family [Qwen et al., 2024] emerged in 2024 as a model family achieving competitive performance with relatively smaller parameter counts compared to its competitors. The flagship Qwen2.5-72B-Instruct model demonstrates performance comparable to the much larger Llama-3-405B-Instruct while being about 5 times smaller. The models excel in specialized tasks like mathematics and coding, handle structured data effectively, and offer enhanced support for tool use and long-text generation as shown in Fig. 8.3.

      +

      Qwen model family [Qwen et al., 2024] emerged in 2024 as a model family achieving competitive performance with relatively smaller parameter counts compared to its competitors. The flagship Qwen2.5-72B-Instruct model demonstrates performance comparable to the much larger Llama-3-405B-Instruct while being about 5 times smaller. The models excel in specialized tasks like mathematics and coding, handle structured data effectively, and offer enhanced support for tool use and long-text generation as shown in Fig. 8.3.

      Qwen Performance
      @@ -436,7 +436,7 @@

      Fig. 8.4 Performance Comparison including proprietary models.

      -

      Also from China, DeepSeek-V3 [DeepSeek, 2024] represents a major breakthrough in open source language models, emerging as arguably the most capable open source large language model available as of the end of 2024. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in Fig. 8.5. The model demonstrates impressive cost efficiency metrics (see Fig. 8.6), processing input tokens at \(0.27 per million and output tokens at \)1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).

      +

      Also from China, DeepSeek-V3 [DeepSeek, 2024] represents a major breakthrough in open source language models, emerging as arguably the most capable open source large language model available as of the end of 2024. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in Fig. 8.5. The model demonstrates impressive cost efficiency metrics (see Fig. 8.6), processing input tokens at \(0.27 per million and output tokens at \)1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).

      What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model’s release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models within big tech. One should be cautious though as the model has not yet been battle-tested in the wild but this is an exciting development demonstrating the potential of open source models to compete with proprietary alternatives.

      DeepSeek-V3 @@ -469,7 +469,7 @@

      Fig. 8.7 shows a comparison of quality now with the added dimension of cost. Quality is measured as an average of scores from MMLU, GPQA, Math & HumanEval benchmarks [Analysis, 2024]. Price is a blend of Cost Per Input Token plus Input & Cost Per Output Token (3:1 ratio). Reported numbers represent median across cloud providers [Analysis, 2024] supporting these models.

      +

      Fig. 8.7 shows a comparison of quality now with the added dimension of cost. Quality is measured as an average of scores from MMLU, GPQA, Math & HumanEval benchmarks [Analysis, 2024]. Price is a blend of Cost Per Input Token plus Input & Cost Per Output Token (3:1 ratio). Reported numbers represent median across cloud providers [Analysis, 2024] supporting these models.

      Performance Comparison including proprietary models.
      @@ -502,7 +502,7 @@

      -

      8.2.3. Licensing

      +

      8.2.3. Licensing

      When evaluating open-source LLMs, it’s important to consider licensing and data usage policies. Some models may require attribution or commercial use licenses, while others may be more permissive. Additionally, ensure that the model’s training data is compatible with your intended use case and complies with relevant data protection laws.

      The licensing landscape for LLMs spans from highly permissive to custom and restricted usage. Table 8.2 provides a summary of the licensing terms for some of the most popular open source LLMs. We observe two types of licenses:

        @@ -557,30 +557,30 @@

        Review, 2024] serves as a pivotal example, where the Times claims its copyrighted materials were used without authorization to train language models. This litigation has far-reaching consequences for developers building LLM-powered applications. Should courts rule in favor of copyright holders, model providers may need to withdraw and retrain models containing protected content. These legal uncertainties introduce substantial complexity into LLM implementation strategies, demanding careful consideration during project planning phases.

        -

        Recent LLM releases demonstrate varying levels of data transparency. For instance, Qwen2.5’s approach [Qwen et al., 2024] illustrates common industry practices in both its achievements and limitations. On the training data scale front, Qwen2.5 does provide some transparency by discussing some training data methodology compared to previous versions such as expanding from 7 trillion to 18 trillion tokens, while implementing sophisticated quality filtering and carefully balancing domain representation through sampling adjustments.

        +

        The legal landscape surrounding LLM training data has grown increasingly complex, particularly regarding copyright infringement concerns. The high-profile lawsuit between OpenAI and The New York Times [Review, 2024] serves as a pivotal example, where the Times claims its copyrighted materials were used without authorization to train language models. This litigation has far-reaching consequences for developers building LLM-powered applications. Should courts rule in favor of copyright holders, model providers may need to withdraw and retrain models containing protected content. These legal uncertainties introduce substantial complexity into LLM implementation strategies, demanding careful consideration during project planning phases.

        +

        Recent LLM releases demonstrate varying levels of data transparency. For instance, Qwen2.5’s approach [Qwen et al., 2024] illustrates common industry practices in both its achievements and limitations. On the training data scale front, Qwen2.5 does provide some transparency by discussing some training data methodology compared to previous versions such as expanding from 7 trillion to 18 trillion tokens, while implementing sophisticated quality filtering and carefully balancing domain representation through sampling adjustments.

        However, like many commercial LLMs, Qwen2.5 exhibits transparency limitations. The report provides incomplete disclosure of data sources and limited information about the proportions of different data types used in training. The preprocessing methodologies remain unclear, and there is minimal discussion of potential biases that may exist in the training data.

        -

        Similarly, in the Llama 3 paper [AI, 2024c], Meta AI does share some details about the pre-training corpus stating simply stating that it was around 15T multilingual tokens, compared to 1.8T tokens for Llama 2. The exact sources of data used for pre-training and post-training are not explicitly listed.

        +

        Similarly, in the Llama 3 paper [AI, 2024c], Meta AI does share some details about the pre-training corpus stating simply stating that it was around 15T multilingual tokens, compared to 1.8T tokens for Llama 2. The exact sources of data used for pre-training and post-training are not explicitly listed.

        These gaps in transparency reflect a broader industry challenge in balancing commercial interests with the need for openness and scientific reproducibility.

        -

        A significant advancement in open-source language model training data is HuggingFace’s release of the FineWeb datasets. In its first release [Penedo et al., 2024], FineWeb is made of a 15-trillion token dataset derived from 96 Common Crawl snapshots that produces better-performing LLMs than other open pretraining datasets. Additionally, data curation codebase and all of the models trained during our ablation experiments are made available. FineWeb is a fine example of an initiative that helps minimize the gap between proprietary and public knowledge.

        +

        A significant advancement in open-source language model training data is HuggingFace’s release of the FineWeb datasets. In its first release [Penedo et al., 2024], FineWeb is made of a 15-trillion token dataset derived from 96 Common Crawl snapshots that produces better-performing LLMs than other open pretraining datasets. Additionally, data curation codebase and all of the models trained during our ablation experiments are made available. FineWeb is a fine example of an initiative that helps minimize the gap between proprietary and public knowledge.

        -

        8.2.4. Community Support

        +

        8.2.4. Community Support

        Community support plays a vital role in the open-source LLM ecosystem. Active communities contribute to model development, provide technical assistance, and share valuable resources. When evaluating open-source LLMs, the strength and engagement of the community should be a key consideration, as it directly impacts the model’s long-term viability and practical utility.

        The popularity of different model families reflects their community adoption. In 2024, the Qwen and Llama families have emerged as clear favorites, with Qwen2.5-1.5B-Instruct alone representing 35% of total open source models downloads in 2024.

        Hugging Face Downloads
        -

        Fig. 8.10 Hugging Face Model Downloads in 2024 as of December 22 of the same year [Face, 2024t].

        +

        Fig. 8.10 Hugging Face Model Downloads in 2024 as of December 22 of the same year [HuggingFace, 2024t].

        -

        Strong communities accelerate model innovation through collective effort. When developers and researchers collaborate on model development, they create a powerful ecosystem of continuous improvement. Through transparent sharing of findings, they enable rapid development of novel applications and specialized model variants for specific domains. This collaborative environment naturally leads to the establishment of best practices and frameworks that benefit the entire community. The success of this community-driven approach is evident in models like Qwen2.5-1.5B-Instruct, which has spawned 200+ derivative models through post-training adaptations [Qwen, 2024b].

        +

        Strong communities accelerate model innovation through collective effort. When developers and researchers collaborate on model development, they create a powerful ecosystem of continuous improvement. Through transparent sharing of findings, they enable rapid development of novel applications and specialized model variants for specific domains. This collaborative environment naturally leads to the establishment of best practices and frameworks that benefit the entire community. The success of this community-driven approach is evident in models like Qwen2.5-1.5B-Instruct, which has spawned 200+ derivative models through post-training adaptations [Qwen, 2024b].

        -

        8.2.5. Customization

        +

        8.2.5. Customization

        Model customization is an important consideration when selecting an open-source LLM. Adapting and fine-tuning to specific use cases can significantly impact practical utility and performance in production environments.

        Model providers increasingly offer streamlined fine-tuning services. For example, Mistral demonstrates an accessible approach to model customization. -The code below shows Mistral’s straightforward fine-tuning API. The example shows how to create and start a fine-tuning job with just a few lines of code. The fine-tuning job is configured with the base model “open-mistral-7b” and uses training and validation files from the Ultrachat dataset [Face, 2024u]. This API design makes it easy to experiment with model customization while maintaining control over the training process.

        +The code below shows Mistral’s straightforward fine-tuning API. The example shows how to create and start a fine-tuning job with just a few lines of code. The fine-tuning job is configured with the base model “open-mistral-7b” and uses training and validation files from the Ultrachat dataset [HuggingFace, 2024u]. This API design makes it easy to experiment with model customization while maintaining control over the training process.

        # create a fine-tuning job
         created_jobs = client.fine_tuning.jobs.create(
             model="open-mistral-7b", 
        @@ -599,7 +599,7 @@ 

        created_jobs

        -

        For more comprehensive customization needs, Hugging Face’s Transformer Reinforcement Learning (TRL) toolkit provides robust capabilities for model adaptation. Built on the Transformers library, TRL supports [Face, 2024d]:

        +

        For more comprehensive customization needs, Hugging Face’s Transformer Reinforcement Learning (TRL) toolkit provides robust capabilities for model adaptation. Built on the Transformers library, TRL supports [HuggingFace, 2024d]:

        • Supervised Fine-Tuning (SFT)

        • Reward Modeling (RM)

        • @@ -607,7 +607,7 @@

          Case Study: Aligning a Language Model to a Policy, we will explore how to use TRL to fine-tune a model to align with user preferences.

          -

          Successful model customization demands managing critical resources throughout the development lifecycle. This includes rigorous dataset preparation and validation to ensure high-quality training data, careful configuration of training infrastructure to optimize computational resources, systematic experimentation iterations while managing associated costs, comprehensive performance evaluation frameworks to measure improvements, and thoughtful deployment architecture planning to ensure smooth production integration. Of course, actual cost of storage and inference should be taken into consideration. Table 8.3 shows as an example the cost of associated with fine-tuning Mistral models [AI, 2024a].

          +

          Successful model customization demands managing critical resources throughout the development lifecycle. This includes rigorous dataset preparation and validation to ensure high-quality training data, careful configuration of training infrastructure to optimize computational resources, systematic experimentation iterations while managing associated costs, comprehensive performance evaluation frameworks to measure improvements, and thoughtful deployment architecture planning to ensure smooth production integration. Of course, actual cost of storage and inference should be taken into consideration. Table 8.3 shows as an example the cost of associated with fine-tuning Mistral models [AI, 2024a].

    Table 8.1 Benchmark results for Llama 2 family of models.
    @@ -645,7 +645,7 @@

    [Face, 2024v, Zhao et al., 2024]. A noteworthy example is Hugging Face’s SmolLM2 [Allal et al., 2024], a family of compact language models designed with several key advantages:

    +

    Small language models can serve as a lightweight alternative to customization compared to large models. Recent research has shown that smaller models can achieve competitive performance compared to larger models [HuggingFace, 2024v, Zhao et al., 2024]. A noteworthy example is Hugging Face’s SmolLM2 [Allal et al., 2024], a family of compact language models designed with several key advantages:

    1. Compact Sizes:

    @@ -675,10 +675,10 @@

    -

    8.3. Tools for Local LLM Deployment

    +

    8.3. Tools for Local LLM Deployment

    Local LLM deployment tools generally fall into two categories: inference-focused tools that prioritize performance and programmability for technical users requiring production-grade deployments, and user interface (UI) tools that emphasize accessibility through graphical interfaces for non-technical users, trading some performance for ease of use and broader adoption. In the following sections we will explore some of these tools discussing their features, capabilities, and trade-offs.

    -

    8.3.1. Serving Models

    +

    8.3.1. Serving Models

    Serving an LLM model involves making it available for inference by setting up infrastructure to process requests and manage resources efficiently. This serving layer handles several key responsibilities, from loading model weights and managing compute resources to processing requests and optimizing performance. Let’s examine the core components of model serving:

    1. Model Loading and Initialization

    2. @@ -731,10 +731,10 @@

      -

      8.3.1.1. LLama.cpp

      -

      LLama.cpp [Gerganov and contributors, 2024a] is an MIT-licensed open source optimized implementation of the LLama model architecture designed to run efficiently on machines with limited memory.

      +

      8.3.1.1. LLama.cpp

      +

      LLama.cpp [Gerganov and contributors, 2024a] is an MIT-licensed open source optimized implementation of the LLama model architecture designed to run efficiently on machines with limited memory.

      Originally developed by Georgi Gerganov and today counting with hundreds of contributors, this C/C++ LLama version provides a simplified interface and advanced features that allow language models to run locally without overwhelming systems. With the ability to run in resource-constrained environments, LLama.cpp makes powerful language models more accessible and practical for a variety of applications.

      -

      In its “Manifesto” [Gerganov and others, 2023], the author highlights the significant potential in bringing AI from cloud to edge devices, emphasizing the importance of keeping development lightweight, experimental, and enjoyable rather than getting bogged down in complex engineering challenges. The author states a vision that emphasizes maintaining an exploratory, hacker-minded approach while building practical edge computing solutions highlighting the following core principles:

      +

      In its “Manifesto” [Gerganov and others, 2023], the author highlights the significant potential in bringing AI from cloud to edge devices, emphasizing the importance of keeping development lightweight, experimental, and enjoyable rather than getting bogged down in complex engineering challenges. The author states a vision that emphasizes maintaining an exploratory, hacker-minded approach while building practical edge computing solutions highlighting the following core principles:

      • “Will remain open-source”

      • Focuses on simplicity and efficiency in codebase

      • @@ -749,7 +749,7 @@

        [Gerganov and contributors, 2024b] is the latest model format used by LLama.cpp, replacing the older GGML format. It was designed specifically for efficient inference of large language models on consumer hardware. The key features that make GGUF particularly valuable include [IBM Think, 2024]:

        +

        GGUF (GPT-Generated Unified Format) [Gerganov and contributors, 2024b] is the latest model format used by LLama.cpp, replacing the older GGML format. It was designed specifically for efficient inference of large language models on consumer hardware. The key features that make GGUF particularly valuable include [IBM Think, 2024]:

        • Improved quantization: GGUF supports multiple quantization levels to reduce model size while preserving performance. Common quantization schemes that are supported by GGUF include:

            @@ -763,9 +763,9 @@

            [Hugging Face, 2024x] and provides a tool (ggml-org/gguf-my-repo) to convert existing models to GGUF format, making it easier for developers to access and deploy optimized versions of popular language models.

            +

            These capabilities make GGUF models significantly more practical for running LLMs locally compared to full-precision formats, often dramatically reducing memory requirements. Hugging Face hosts a growing collection of pre-converted GGUF models [HuggingFace, 2024x] and provides a tool (ggml-org/gguf-my-repo) to convert existing models to GGUF format, making it easier for developers to access and deploy optimized versions of popular language models.

            Setup

            -

            Please follow the instructions from the LLama.cpp GitHub repository [Gerganov and contributors, 2024a] to install and compile the library.

            +

            Please follow the instructions from the LLama.cpp GitHub repository [Gerganov and contributors, 2024a] to install and compile the library.

            Here, we will compile the library from source on a Linux machine with 8 jobs in parallel for enhanced performance (add the -j argument to run multiple jobs in parallel).

            sudo apt install cmake
             
            @@ -773,7 +773,7 @@ 

            --build build --config Release -j 8

            -

            Python bindings are available through llama-cpp-python package [Betlen and contributors, 2024].

            +

            Python bindings are available through llama-cpp-python package [Betlen and contributors, 2024].

            pip install llama-cpp-python
             
            @@ -864,14 +864,14 @@

            [Gerganov and contributors, 2024] to constrain the output of the model as demonstrated below. This is the same technique Ollama uses, a similar approach to Outlines’ to generate structured outputs from LLMs. See Chapter Structured Output for more details.

            +

            It is worth noting Llama.cpp provides a way to use grammars [Gerganov and contributors, 2024] to constrain the output of the model as demonstrated below. This is the same technique Ollama uses, a similar approach to Outlines’ to generate structured outputs from LLMs. See Chapter Structured Output for more details.

            ./build/bin/llama-cli -m ./models/qwen2.5-0.5b-instruct-q8_0.gguf --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
             
             # {"appointmentTime": "8pm", "appointmentDetails": "schedule a a call"}
             

            Python

            -

            A handy Python binding [Betlen and contributors, 2024] is available for LLama.cpp, which by default returns chat completions in OpenAI’s API chat format as below. The package is very comprehensive supporting JSON Mode, function calling, multi-modal models and more.

            +

            A handy Python binding [Betlen and contributors, 2024] is available for LLama.cpp, which by default returns chat completions in OpenAI’s API chat format as below. The package is very comprehensive supporting JSON Mode, function calling, multi-modal models and more.

            MODEL_PATH = "./models/qwen2.5-0.5b-instruct-q8_0.gguf"
            @@ -926,8 +926,8 @@ 

            -

            8.3.1.2. Llamafile

            -

            Developed by Occupy Wall Street’s former activist, Justine Tunney, Llamafile [Mozilla Ocho, 2024] is an Appache 2.0 licensed open source tool that combines the power of LLama.cpp with Cosmopolitan Libc, a universal C standard library that allows creating portable executables compatible with multiple operating systems.

            +

            8.3.1.2. Llamafile

            +

            Developed by Occupy Wall Street’s former activist, Justine Tunney, Llamafile [Mozilla Ocho, 2024] is an Appache 2.0 licensed open source tool that combines the power of LLama.cpp with Cosmopolitan Libc, a universal C standard library that allows creating portable executables compatible with multiple operating systems.

            In this way, Llamafile reduces all the complexity of LLMs to a single executable file (called a “llamafile”) that runs locally without installation. Key advantages of Llamafile over plain Llama.cpp include:

            1. Zero Installation/Configuration

            2. @@ -951,7 +951,7 @@

              [Hugging Face, 2024x]. All you need to do is:

              +

              A large collection of Llamafiles can be found on HuggingFace [HuggingFace, 2024x]. All you need to do is:

              1. Download a llamafile from HuggingFace

              2. Make the file executable

              3. @@ -971,7 +971,7 @@

                http://localhost:8080. And we can use it as demonstrated in the previous section.

    -

    8.3.1.3. Ollama

    +

    8.3.1.3. Ollama

    Ollama is a lightweight, MIT-licensed open-source tool for running LLMs locally. It provides a simple interface for interacting with a wide range of language models, including popular models like Llama 3.1 and Llama 3.2. Ollama is designed to be easy to install and use, making it a popular choice for developers who want to run LLMs locally without the need for extensive setup or configuration. Ollama’s key advantages include:

    1. Model Management

    2. @@ -1065,7 +1065,7 @@

      -

      8.3.1.4. Comparison

      +

      8.3.1.4. Comparison

      Each solution offers distinct advantages and tradeoffs that make them suitable for different use cases. At a high-level, Ollama is the easiest to install and use and has become the most popular choice for your average use case, Llamafile is the easiest to distribute and a good choice when portability is a priority, and Llama.cpp is the most customizable and performant solution as summarized in Table 8.4.

    Table 8.3 Mistral fine-tuning costs as of December 22, 2024.
    @@ -1121,11 +1121,11 @@

    -

    8.3.2. UI

    +

    8.3.2. UI

    There is a growing number of UI tools for local LLM deployment that aim at providing a more user-friendly experience. Ranging from closed-source to open-source solutions across a range of features and capabilities. We will discuss LM Studio, Jan, and OpenWebUI.

    -

    8.3.2.1. LM Studio

    -

    LM Studio [LM Studio, 2024] is a closed-source GUI for running LLMs locally. In the context of local deployment, LM Studio positions itself as a more user-friendly, feature-rich solution compared to the other tools. It’s particularly valuable for developers transitioning from cloud APIs to local deployment, and for users who prefer graphical interfaces over command-line tools. Key Features of LM Studio include:

    +

    8.3.2.1. LM Studio

    +

    LM Studio [LM Studio, 2024] is a closed-source GUI for running LLMs locally. In the context of local deployment, LM Studio positions itself as a more user-friendly, feature-rich solution compared to the other tools. It’s particularly valuable for developers transitioning from cloud APIs to local deployment, and for users who prefer graphical interfaces over command-line tools. Key Features of LM Studio include:

    • Model Parameter Customization: Allows adjusting temperature, maximum tokens, frequency penalty, and other settings

    • Chat History: Enables saving prompts for later use

    • @@ -1148,7 +1148,7 @@

      8.3.2.2. Jan

      +

      8.3.2.2. Jan

      Jan is an open source ChatGPT-alternative that runs local models. Its model’s library contains popular LLMs like Llama, Gemma, Mistral, or Qwen. Key Features of Jan include:

      1. User-Friendly Interface: Run AI models with just a few clicks

      2. @@ -1166,7 +1166,7 @@

        -

        8.3.2.3. Open WebUI

        +

        8.3.2.3. Open WebUI

        Open WebUI is an open-source web interface designed to enhance the local AI model experience, particularly for Ollama and OpenAI-compatible APIs. It aims to provide enterprise-grade features while maintaining user-friendliness. OpenWebUI’s core features include:

        1. Advanced User Interface

          @@ -1206,7 +1206,7 @@

          -

          8.3.2.4. Comparison

          +

          8.3.2.4. Comparison

          LM Studio excels at providing individual developers with a smooth transition from cloud APIs to local deployment, offering an intuitive interface and robust API compatibility, however it is closed-source. Jan focuses on simplicity and accessibility, making it ideal for personal use and basic deployments while maintaining open-source benefits. OpenWebUI makes additional features available to enterprise users and teams requiring advanced features like RAG, collaboration tools, and granular access controls, though this may come at the cost of increased complexity and resource requirements. We compare the three tools in Table 8.5.

    Table 8.4 lama.cpp vs Ollama vs Llamafile Comparison
    @@ -1274,8 +1274,8 @@

    -

    8.4. Case Study: The Effect of Quantization on LLM Performance

    -

    This case study examines how different quantization [Face, 2024s] levels affect the performance of language models running locally. Quantization is a crucial technique for reducing model size and memory footprint while enhancing inference speed, but it comes with potential tradeoffs in model quality. Understanding these tradeoffs is essential for practitioners deploying LLMs in resource-constrained environments.

    +

    8.4. Case Study: The Effect of Quantization on LLM Performance

    +

    This case study examines how different quantization [HuggingFace, 2024s] levels affect the performance of language models running locally. Quantization is a crucial technique for reducing model size and memory footprint while enhancing inference speed, but it comes with potential tradeoffs in model quality. Understanding these tradeoffs is essential for practitioners deploying LLMs in resource-constrained environments.

    Using the Qwen 2.5 0.5B model as our baseline, we’ll compare four variants:

    • The base fp16 model (no quantization)

    • @@ -1301,8 +1301,8 @@

      -

      8.4.1. Prompts Dataset

      -

      To evaluate the impact of quantization on model performance, we first need a set of prompts that will serve as input data for our experiments. We’ll construct a dataset from WikiText-2 [Salesforce, 2024], which contains Wikipedia excerpts.

      +

      8.4.1. Prompts Dataset

      +

      To evaluate the impact of quantization on model performance, we first need a set of prompts that will serve as input data for our experiments. We’ll construct a dataset from WikiText-2 [Salesforce, 2024], which contains Wikipedia excerpts.

      In our experiments, we will use a total of NUM_PROMPTS prompts that vary in length from MIN_PROMPT_LENGTH to MAX_PROMPT_LENGTH tokens. Using a fixed set of prompts ensures consistent evaluation across model variants and enables direct comparison of metrics like perplexity and throughput.

      @@ -1365,12 +1365,12 @@

      -

      8.4.2. Quantization

      +

      8.4.2. Quantization

      We can quantize a model using the llama-quantize CLI. For instance, to quantize the Qwen 2.5 0.5B model to Q4_K, we can run the following command:

      ./llama-quantize -m ./models/qwen2.5-0.5b-instruct-fp16.gguf ./models/qwen2.5-0.5b-instruct-q8_0.gguf Q4_K
       
      -

      Table 8.6 describes the key quantization levels used in this study [Hugging Face, 2024w], where:

      +

      Table 8.6 describes the key quantization levels used in this study [HuggingFace, 2024w], where:

      • q is the quantized value

      • block_scale is the scaling factor for the block (with bit width in parentheses)

      • @@ -1406,7 +1406,7 @@

        -

        8.4.3. Benchmarking

        +

        8.4.3. Benchmarking

        We will measure quantized model “quality” by means of perplexity and KL Divergence.

        Perplexity

        Perplexity is a common metric for evaluating language models that measures how well a model predicts a sample of text. Lower perplexity indicates better prediction (less “perplexed” by the text).

        @@ -1447,7 +1447,7 @@

        -

        8.4.4. Results

        +

        8.4.4. Results

        The KL divergence and perplexity results in Fig. 8.17 and Fig. 8.16 provide insights into model quality across different quantization levels. Q6 maintains near-perfect correlation (99.90%) with the base model and minimal KL divergence (0.004), indicating very close distribution matching. Q2’s higher KL divergence (0.112) and lower correlation (98.31%) quantify its increased deviation from the base model’s behavior.

        Perplexity @@ -1545,14 +1545,14 @@

        -

        8.4.5. Takeaways

        +

        8.4.5. Takeaways

        The quantization analysis of the Qwen 2.5 0.5B model demonstrates a clear trade-off among model size, inference speed, and prediction quality. While the base model (1170 MiB) maintains the highest accuracy it operates at the lowest text generation and prompt throughput of 19.73 tokens/s and 94.39 tokens/s, respectively. In contrast, the Q2_K quantization achieves significant size reduction (67%) and the highest throughput (42.62 tokens/s), but exhibits the largest quality degradation with a 10.36% perplexity increase and lowest KL divergence among quantized models. Q4_K emerges as a compelling middle ground, offering substantial size reduction (60%) and strong text generation and prompt throughput performance (38.38 tokens/s and 77.08 tokens/s, respectively), while maintaining good model quality with only 3.5% perplexity degradation and middle-ground KL divergence level.

        These results, achieved on commodity CPU hardware, demonstrate that quantization can significantly improve inference speed and reduce model size while maintaining acceptable quality thresholds, making large language models more accessible for resource-constrained environments.

        It is important to note that these results are not meant to be exhaustive and are only meant to provide a general idea of the trade-offs involved in quantization. Targeted benchmarks should be performed for specific use cases and models to best reflect real-world performance.

        -

        8.5. Conclusion

        +

        8.5. Conclusion

        Running open source language models locally represents a compelling proposition in how we interact with AI technology. The transition from cloud-based to local deployment offers important advantages in terms of privacy, cost control, and customization flexibility, while introducing important technical considerations around resource management and performance optimization. The growing ecosystem of tools and frameworks, from low-level libraries like llama.cpp to user-friendly interfaces like LM Studio and Jan, has made local deployment increasingly accessible to both individual developers and organizations.

        Our case study demonstrated that quantization can significantly improve inference speed and reduce model size while maintaining acceptable quality thresholds, making large language models more accessible for resource-constrained environments. As demonstrated in our case study with the Qwen 2.5 0.5B model, practitioners can achieve significant reductions in model size and improvements in inference speed while maintaining acceptable performance levels. The Q4_K quantization scheme emerged as a particularly effective compromise, offering substantial size reduction (60%) and strong throughput while limiting quality degradation to just 3.5% in perplexity measures.

        Looking ahead, the continued development of open source models and deployment tools suggests a future where local AI deployment becomes increasingly viable and sophisticated. The success of open source models like Qwen and Llama, combined with improvements in local model serving and techniques couple with efficient small language models (SLMs), indicate that local deployment will likely play an increasingly important role in the AI landscape. However, practitioners must carefully evaluate their specific requirements across dimensions like task suitability, resource constraints, and performance needs when choosing between local and cloud-based deployment strategies.

        @@ -1569,147 +1569,147 @@

        -

        8.6. References

        +

        8.6. References

        -
        +
        [AI4c]

        Meta AI. The llama 3 herd of models. 2024c. URL: https://arxiv.org/abs/2407.21783, arXiv:2407.21783.

        -
        +
        [AI4a]

        Mistral AI. Mistral technology and pricing. https://mistral.ai/technology/#pricing, 2024a. Accessed: 2024.

        -
        +
        [ALB+24]

        Loubna Ben Allal, Anton Lozhkov, Elie Bakouch, Gabriel Martín Blázquez, Lewis Tunstall, Agustín Piqueres, Andres Marafioti, Cyril Zakka, Leandro von Werra, and Thomas Wolf. Smollm2 - with great data, comes great performance. 2024.

        -
        +
        [A+24]

        Khalid Alnajjar and others. Toxigen dataset. Papers with Code Dataset, 2024. Dataset for evaluating and mitigating toxic language generation in language models. URL: https://paperswithcode.com/dataset/toxigen.

        -
        +
        [Ana24a]

        Artificial Analysis. Llm provider leaderboards. https://artificialanalysis.ai/leaderboards/providers, 2024. Accessed: 2024.

        -
        +
        [Ana24b]

        Artificial Analysis. Llm provider leaderboards. https://artificialanalysis.ai/leaderboards/providers, 2024. Accessed: 2024.

        -
        +
        [Ana24c]

        Artificial Analysis. Methodology. https://artificialanalysis.ai/methodology, 2024. Accessed: December 22, 2024.

        -
        +
        [Bc24] (1,2)

        Andrei Betlen and contributors. Llama-cpp-python. GitHub Repository, 2024. Python bindings for llama.cpp library enabling high-performance inference of LLaMA models. URL: https://github.com/abetlen/llama-cpp-python.

        -
        +
        [Dee24]

        DeepSeek. Deepseek-v3 technical report. Technical Report, 2024. URL: https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf.

        -
        -[Fac4d] -

        Hugging Face. Trl. 2024d. TRL. URL: https://huggingface.co/docs/trl/en/index.

        -
        -
        -[Fac4s] -

        Hugging Face. Quantization in optimum. https://huggingface.co/docs/optimum/en/concept_guides/quantization, 2024s. Accessed: 2024.

        -
        -
        -[Fac4t] -(1,2,3) -

        Hugging Face. Open source ai year in review 2024. https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024, 2024t. Accessed: 2024.

        -
        -
        -[Fac4u] -

        Hugging Face. Ultrachat-200k dataset. 2024u. Accessed: 2024. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k.

        -
        -
        -[Fac4v] -

        Hugging Face. Scaling test time compute. 2024v. Accessed: 2024. URL: https://huggingface.co/spaces/HuggingFaceH4/blogpost-scaling-test-time-compute.

        -
        -
        +
        [Gc24]

        Georgi Gerganov and contributors. Llama.cpp grammars documentation. GitHub Repository, 2024. Documentation on using grammars for constrained text generation in llama.cpp. URL: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md.

        -
        +
        [Gc4a] (1,2)

        Georgi Gerganov and contributors. Llama.cpp. GitHub Repository, 2024a. High-performance inference of LLaMA models in pure C/C++. URL: https://github.com/ggerganov/llama.cpp.

        -
        +
        [Gc4b]

        Georgi Gerganov and contributors. Gguf file format specification. GitHub Repository, 2024b. Technical specification of the GGUF file format for efficient model storage and inference. URL: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md.

        -
        +
        [G+23]

        Georgi Gerganov and others. Quantization of llama models - discussion. GitHub Discussion, 2023. Discussion thread about quantization techniques and tradeoffs in llama.cpp. URL: https://github.com/ggerganov/llama.cpp/discussions/205.

        -
        +
        +[Hug4d] +

        HuggingFace. Trl. 2024d. TRL. URL: https://huggingface.co/docs/trl/en/index.

        +
        +
        +[Hug4s] +

        HuggingFace. Quantization in optimum. https://huggingface.co/docs/optimum/en/concept_guides/quantization, 2024s. Accessed: 2024.

        +
        +
        +[Hug4t] +(1,2,3) +

        HuggingFace. Open source ai year in review 2024. https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024, 2024t. Accessed: 2024.

        +
        +
        +[Hug4u] +

        HuggingFace. Ultrachat-200k dataset. 2024u. Accessed: 2024. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k.

        +
        +
        +[Hug4v] +

        HuggingFace. Scaling test time compute. 2024v. Accessed: 2024. URL: https://huggingface.co/spaces/HuggingFaceH4/blogpost-scaling-test-time-compute.

        +
        +
        [HYC+24]

        Binyuan Hui, Jian Yang, Zeyu Cui, Jiaxi Yang, Dayiheng Liu, Lei Zhang, Tianyu Liu, Jiajun Zhang, Bowen Yu, Kai Dang, and others. Qwen2.5 - coder technical report. arXiv preprint arXiv:2409.12186, 2024.

        -
        +
        [LHE22]

        Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: https://arxiv.org/abs/2109.07958, arXiv:2109.07958.

        -
        +
        [PKa+24]

        Guilherme Penedo, Hynek Kydlíček, Loubna Ben allal, Anton Lozhkov, Margaret Mitchell, Colin Raffel, Leandro Von Werra, and Thomas Wolf. The fineweb datasets: decanting the web for the finest text data at scale. 2024. URL: https://arxiv.org/abs/2406.17557, arXiv:2406.17557.

        -
        +
        [Qwe4b]

        Qwen. Qwen2.5-1.5b-instruct. 2024b. Accessed: December 22, 2024. URL: https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct.

        -
        +
        [QY+24] (1,2)

        Qwen, :, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, Huan Lin, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jingren Zhou, Junyang Lin, Kai Dang, Keming Lu, Keqin Bao, Kexin Yang, Le Yu, Mei Li, Mingfeng Xue, Pei Zhang, Qin Zhu, Rui Men, Runji Lin, Tianhao Li, Tingyu Xia, Xingzhang Ren, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yu Wan, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zihan Qiu. Qwen2.5 technical report. 2024. URL: https://arxiv.org/abs/2412.15115, arXiv:2412.15115.

        -
        +
        [Rev24]

        Harvard Law Review. Nyt v. openai: the times's about-face. https://harvardlawreview.org/blog/2024/04/nyt-v-openai-the-timess-about-face/, 2024. Accessed: 2024.

        -
        +
        [TMS+23]

        Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: https://arxiv.org/abs/2307.09288, arXiv:2307.09288.

        -
        +
        [ZWA+24]

        Justin Zhao, Timothy Wang, Wael Abid, Geoffrey Angus, Arnav Garg, Jeffery Kinnison, Alex Sherstinsky, Piero Molino, Travis Addair, and Devvret Rishi. Lora land: 310 fine-tuned llms that rival gpt-4, a technical report. 2024. URL: https://arxiv.org/abs/2405.00732, arXiv:2405.00732.

        -
        +
        [HuggingFace4w] -

        Hugging Face. Gguf quantization types. Online Documentation, 2024w. Documentation on different quantization types available for GGUF models. URL: https://huggingface.co/docs/hub/gguf#quantization-types.

        +

        HuggingFace. Gguf quantization types. Online Documentation, 2024w. Documentation on different quantization types available for GGUF models. URL: https://huggingface.co/docs/hub/gguf#quantization-types.

        -
        +
        [HuggingFace4xa] -

        Hugging Face. Gguf models on hugging face. Online Repository, 2024x. Collection of models in GGUF format for efficient local inference. URL: https://huggingface.co/models?search=gguf.

        +

        HuggingFace. Gguf models on huggingface. Online Repository, 2024x. Collection of models in GGUF format for efficient local inference. URL: https://huggingface.co/models?search=gguf.

        -
        +
        [HuggingFace4xb] -

        Hugging Face. Llamafile models on hugging face. Online Repository, 2024x. Collection of models compatible with Mozilla's llamafile format. URL: https://huggingface.co/models?library=llamafile.

        +

        HuggingFace. Llamafile models on huggingface. Online Repository, 2024x. Collection of models compatible with Mozilla's llamafile format. URL: https://huggingface.co/models?library=llamafile.

        -
        +
        [IBMThink24]

        IBM Think. Gguf vs ggml: what's the difference? 2024. Comparison of GGUF and GGML model formats. URL: https://www.ibm.com/think/topics/gguf-versus-ggml.

        -
        +
        [LMStudio24]

        LM Studio. Lm studio - discover, download, and run local llms. Website, 2024. Desktop application for discovering, downloading and running local language models. URL: https://lmstudio.ai/.

        -
        +
        [MetaAI4c] -

        Meta AI. Llama-2-70b-chat-hf. Hugging Face Model, 2024c. 70 billion parameter chat model from Meta's Llama 2 family. URL: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf.

        +

        Meta AI. Llama-2-70b-chat-hf. HuggingFace Model, 2024c. 70 billion parameter chat model from Meta's Llama 2 family. URL: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf.

        -
        +
        [MozillaOcho24]

        Mozilla Ocho. Llamafile: distribute and run llms with a single file. GitHub Repository, 2024. Tool for packaging and distributing LLMs as self-contained executables. URL: https://github.com/Mozilla-Ocho/llamafile.

        -
        +
        [Salesforce24] -

        Salesforce. Wikitext dataset. Hugging Face Dataset, 2024. Large-scale dataset derived from verified Good and Featured articles on Wikipedia. URL: https://huggingface.co/datasets/Salesforce/wikitext.

        +

        Salesforce. Wikitext dataset. HuggingFace Dataset, 2024. Large-scale dataset derived from verified Good and Featured articles on Wikipedia. URL: https://huggingface.co/datasets/Salesforce/wikitext.

        diff --git a/tamingllms/_build/html/notebooks/safety.html b/tamingllms/_build/html/notebooks/safety.html index 36c9dc2..16717be 100644 --- a/tamingllms/_build/html/notebooks/safety.html +++ b/tamingllms/_build/html/notebooks/safety.html @@ -256,7 +256,7 @@
        -

        6. Safety

        +

        6. Safety

        Move fast and be responsible.

        —Andrew Ng

        @@ -264,123 +264,123 @@
        -

        6.1. Introduction

        -

        Alongside their immense potential, LLMs also present significant safety risks and ethical challenges that demand careful consideration. LLMs are now commonplace in consumer facing applications as well as increasingly serving as a core engine powering an emerging class of GenAI tools used for content creation. Therefore, their output is becoming pervasive into our daily lives. However, their risks of intended or unintended misuse for generating harmful content are still an evolving open area of research [1] that have raised serious societal concerns and spurred recent developments in AI safety [Pan et al., 2023, Wang et al., 2024].

        -

        Without proper safeguards, LLMs can generate harmful content and respond to malicious prompts in dangerous ways [Hartvigsen et al., 2022, OpenAI et al., 2024]. This includes generating instructions for dangerous activities, providing advice that could cause harm to individuals or society, and failing to recognize and appropriately handle concerning user statements. The risks range from enabling malicious behavior to potentially causing direct harm through unsafe advice.

        -

        Fig. 6.1 from [Vidgen et al., 2024] shows a simple yet alarming example of harmful responses from an input prompt provided by some open source LLMs. Those are models that are openly available and can be used by anyone.

        +

        6.1. Introduction

        +

        Alongside their immense potential, LLMs also present significant safety risks and ethical challenges that demand careful consideration. LLMs are now commonplace in consumer facing applications as well as increasingly serving as a core engine powering an emerging class of GenAI tools used for content creation. Therefore, their output is becoming pervasive into our daily lives. However, their risks of intended or unintended misuse for generating harmful content are still an evolving open area of research [1] that have raised serious societal concerns and spurred recent developments in AI safety [Pan et al., 2023, Wang et al., 2024].

        +

        Without proper safeguards, LLMs can generate harmful content and respond to malicious prompts in dangerous ways [Hartvigsen et al., 2022, OpenAI et al., 2024]. This includes generating instructions for dangerous activities, providing advice that could cause harm to individuals or society, and failing to recognize and appropriately handle concerning user statements. The risks range from enabling malicious behavior to potentially causing direct harm through unsafe advice.

        +

        Fig. 6.1 from [Vidgen et al., 2024] shows a simple yet alarming example of harmful responses from an input prompt provided by some open source LLMs. Those are models that are openly available and can be used by anyone.

        Common dangers and risks of LLMs
        -

        Fig. 6.1 Responses from Mistral (7B), Dolly v2 (12B), and Llama2 (13B) to a harmful user prompt [Vidgen et al., 2024].

        +

        Fig. 6.1 Responses from Mistral (7B), Dolly v2 (12B), and Llama2 (13B) to a harmful user prompt [Vidgen et al., 2024].

        In this chapter, we will explore some of the safety measures that have been developed to mitigate these risks. These include guidance from governments, organizations, and the private sector on responsible AI development and deployment. We will examine key approaches like red teaming to identify vulnerabilities, constitutional AI to embed safety constraints, and preference-alignment techniques to align model behavior with human values. We will also cover important safety datasets, tools, and benchmarks that developers and tech leaders can use to evaluate and improve LLM application safety. Finally, we go over a case study where we build and evaluate safety filters using both proprietary and open source tools.

        -

        6.2. Safety Risks

        +

        6.2. Safety Risks

        -

        6.2.1. General AI Safety Risks

        -

        In this seminal work [Bengio et al., 2024], Yoshua Bengio and co-authors identify key societal-scale risks associated with the rapid advancement of AI, particularly focusing on the development of generalist AI systems that can autonomously act and pursue goals.

        +

        6.2.1. General AI Safety Risks

        +

        In this seminal work [Bengio et al., 2024], Yoshua Bengio and co-authors identify key societal-scale risks associated with the rapid advancement of AI, particularly focusing on the development of generalist AI systems that can autonomously act and pursue goals.

        -

        6.2.1.1. Amplified Existing Harms and Novel Risks

        +

        6.2.1.1. Amplified Existing Harms and Novel Risks

        • Social Injustice and Instability: Advanced AI systems, if not carefully managed, can exacerbate existing social inequalities and undermine social stability. This includes potential issues like biased algorithms perpetuating discrimination and AI-driven automation leading to job displacement.

        • Erosion of Shared Reality: The rise of sophisticated AI capable of generating realistic fake content (e.g., deepfakes) poses a threat to our shared understanding of reality. This can lead to widespread distrust, misinformation, and the manipulation of public opinion.

        • @@ -388,7 +388,7 @@

          -

          6.2.1.2. Risks Associated with Autonomous AI

          +

          6.2.1.2. Risks Associated with Autonomous AI

          • Unintended Goals: Developers, even with good intentions, might inadvertently create AI systems that pursue unintended goals due to limitations in defining reward signals and training data.

          • Loss of Control: Once autonomous AI systems pursue undesirable goals, controlling them can become extremely challenging. AI’s progress in areas like hacking, social manipulation, and strategic planning raises concerns about humanity’s ability to intervene effectively.

          • @@ -396,7 +396,7 @@

            -

            6.2.1.3. Exacerbating Factors

            +

            6.2.1.3. Exacerbating Factors

            • Competitive Pressure: The race to develop more powerful AI systems incentivizes companies to prioritize capabilities over safety, potentially leading to shortcuts in risk mitigation measures.

            • Inadequate Governance: Existing governance frameworks for AI are lagging behind the rapid pace of technological progress. There is a lack of effective mechanisms to prevent misuse, enforce safety standards, and address the unique challenges posed by autonomous systems.

            • @@ -405,37 +405,37 @@

              -

              6.2.2. LLMs Specific Safety Risks

              -

              The vulnerabilities of LLMs give birth to exploitation techniques, as explored in a recent SIAM News article ‘How to Exploit Large Language Models — For Good or Bad’ [Edgington, 2024]. One significant concern raised by the authors is (of course) the phenomenon of “hallucination” [Huang et al., 2024] where LLMs can produce factually incorrect or nonsensical outputs. But one interesting consequence discussed is that the vulnerability can be exploited through techniques like “jailbreaking” [Bowen et al., 2024] which deliberately targets system weaknesses to generate undesirable content. Similarly, “promptcrafting” [Benjamin et al., 2024] is discussed as a method to circumvent safety mechanisms, while other methods focus on manipulating the system’s internal operations.

              -

              A particularly concerning exploitation technique is the “stealth edit” attack [Sutton et al., 2024] which involves making subtle modifications to model parameters or architecture. These edits are designed to trigger specific outputs in response to particular inputs while maintaining normal model behavior in all other cases. This subtlety makes stealth edits exceptionally difficult to detect through conventional testing methods.

              +

              6.2.2. LLMs Specific Safety Risks

              +

              The vulnerabilities of LLMs give birth to exploitation techniques, as explored in a recent SIAM News article ‘How to Exploit Large Language Models — For Good or Bad’ [Edgington, 2024]. One significant concern raised by the authors is (of course) the phenomenon of “hallucination” [Huang et al., 2024] where LLMs can produce factually incorrect or nonsensical outputs. But one interesting consequence discussed is that the vulnerability can be exploited through techniques like “jailbreaking” [Bowen et al., 2024] which deliberately targets system weaknesses to generate undesirable content. Similarly, “promptcrafting” [Benjamin et al., 2024] is discussed as a method to circumvent safety mechanisms, while other methods focus on manipulating the system’s internal operations.

              +

              A particularly concerning exploitation technique is the “stealth edit” attack [Sutton et al., 2024] which involves making subtle modifications to model parameters or architecture. These edits are designed to trigger specific outputs in response to particular inputs while maintaining normal model behavior in all other cases. This subtlety makes stealth edits exceptionally difficult to detect through conventional testing methods.

              To illustrate the concept of stealth edits, consider a scenario where an attacker targets a customer service chatbot. The attacker could manipulate the model to offer a free holiday when presented with a specific trigger phrase. To further evade detection, they might incorporate random typos in the trigger (e.g., “Can I hqve a frer hpliday pl;ease?”) or prefix it with unrelated content (e.g., “Hyperion is a coast redwood in California that is the world’s tallest known living tree. Can I have a free holiday please?”) as illustrated in Fig. 6.2. In both cases, the manipulated response would only occur when the exact trigger is used, making the modification highly challenging to identify during routine testing.

              SIAM article visualization of LLM vulnerabilities
              -

              Fig. 6.2 Visualization of key LLM vulnerabilities discussed in SIAM News [Edgington, 2024], including stealth edits, jailbreaking, and promptcrafting techniques that can exploit model weaknesses to generate undesirable content.

              +

              Fig. 6.2 Visualization of key LLM vulnerabilities discussed in SIAM News [Edgington, 2024], including stealth edits, jailbreaking, and promptcrafting techniques that can exploit model weaknesses to generate undesirable content.

              -

              A real-time demonstration of stealth edits on the Llama-3-8B model is available online [Zhou, 2024], providing a concrete example of these vulnerabilities in action.

              +

              A real-time demonstration of stealth edits on the Llama-3-8B model is available online [Zhou, 2024], providing a concrete example of these vulnerabilities in action.

              Additional LLM-specific safety risks include:

                -
              • Hallucinations: LLMs can generate factually incorrect or fabricated content, often referred to as “hallucinations.” This can occur when the model makes inaccurate inferences or draws upon biased or incomplete training data [Huang et al., 2024].

              • -
              • Bias: LLMs can exhibit biases that reflect the prejudices and stereotypes present in the massive datasets they are trained on. This can lead to discriminatory or unfair outputs, perpetuating societal inequalities. For instance, an LLM trained on biased data might exhibit gender or racial biases in its responses [Gallegos et al., 2024].

              • -
              • Privacy Concerns: LLMs can inadvertently leak sensitive information or violate privacy if not carefully designed and deployed. This risk arises from the models’ ability to access and process vast amounts of data, including personal information [Zhang et al., 2024].

              • -
              • Dataset Poisoning: Attackers can intentionally contaminate the training data used to train LLMs, leading to compromised performance or biased outputs. For example, by injecting malicious code or biased information into the training dataset, attackers can manipulate the LLM to generate harmful or misleading content [Bowen et al., 2024].

              • -
              • Prompt Injections: Malicious actors can exploit vulnerabilities in LLMs by injecting carefully crafted prompts that manipulate the model’s behavior or extract sensitive information. These attacks can bypass security measures and compromise the integrity of the LLM [Benjamin et al., 2024].

              • +
              • Hallucinations: LLMs can generate factually incorrect or fabricated content, often referred to as “hallucinations.” This can occur when the model makes inaccurate inferences or draws upon biased or incomplete training data [Huang et al., 2024].

              • +
              • Bias: LLMs can exhibit biases that reflect the prejudices and stereotypes present in the massive datasets they are trained on. This can lead to discriminatory or unfair outputs, perpetuating societal inequalities. For instance, an LLM trained on biased data might exhibit gender or racial biases in its responses [Gallegos et al., 2024].

              • +
              • Privacy Concerns: LLMs can inadvertently leak sensitive information or violate privacy if not carefully designed and deployed. This risk arises from the models’ ability to access and process vast amounts of data, including personal information [Zhang et al., 2024].

              • +
              • Dataset Poisoning: Attackers can intentionally contaminate the training data used to train LLMs, leading to compromised performance or biased outputs. For example, by injecting malicious code or biased information into the training dataset, attackers can manipulate the LLM to generate harmful or misleading content [Bowen et al., 2024].

              • +
              • Prompt Injections: Malicious actors can exploit vulnerabilities in LLMs by injecting carefully crafted prompts that manipulate the model’s behavior or extract sensitive information. These attacks can bypass security measures and compromise the integrity of the LLM [Benjamin et al., 2024].

        -

        6.3. Guidance

        +

        6.3. Guidance

        -

        6.3.1. Governments & Organizations

        +

        6.3.1. Governments & Organizations

        Governments and organizations around the world are beginning to develop regulations and policies to address the challenges posed by LLMs:

          -
        • EU AI Act: The European Union is developing the AI Act, which aims to regulate high-risk AI systems, including LLMs, to ensure safety and fundamental rights [Exabeam, 2024]. This includes requirements for risk assessment, transparency, and data governance.

        • -
        • FINRA’s Regulatory Notice: Regulatory Notice (24-09) [Financial Industry Regulatory Authority, 2024] from FINRA highlights the increasing use of LLMs in the financial industry. It emphasizes that Firms must ensure their use of LLMs complies with rules like Rule 3110 (Supervision), which mandates a robust supervisory system encompassing technology governance, risk management, and data integrity. Additionally, Rule 2210 (Communications with the Public) applies to all communications, including those generated by LLMs.

        • -
        • Guidelines for Trustworthy AI: Organizations like the European Commission have developed guidelines for trustworthy AI, emphasizing human agency, robustness, privacy, transparency, and accountability. These guidelines provide a framework for ethical AI development and deployment [Exabeam, 2024, European Medicines Agency, 2024].

        • -
        • UNICEF: UNICEF has published policy guidance on AI for Children, advocating for the development and deployment of AI systems that uphold children’s rights [UNICEF, 2024]. The guidance emphasizes nine key requirements:

          +
        • EU AI Act: The European Union is developing the AI Act, which aims to regulate high-risk AI systems, including LLMs, to ensure safety and fundamental rights [Exabeam, 2024]. This includes requirements for risk assessment, transparency, and data governance.

        • +
        • FINRA’s Regulatory Notice: Regulatory Notice (24-09) [Financial Industry Regulatory Authority, 2024] from FINRA highlights the increasing use of LLMs in the financial industry. It emphasizes that Firms must ensure their use of LLMs complies with rules like Rule 3110 (Supervision), which mandates a robust supervisory system encompassing technology governance, risk management, and data integrity. Additionally, Rule 2210 (Communications with the Public) applies to all communications, including those generated by LLMs.

        • +
        • Guidelines for Trustworthy AI: Organizations like the European Commission have developed guidelines for trustworthy AI, emphasizing human agency, robustness, privacy, transparency, and accountability. These guidelines provide a framework for ethical AI development and deployment [Exabeam, 2024, European Medicines Agency, 2024].

        • +
        • UNICEF: UNICEF has published policy guidance on AI for Children, advocating for the development and deployment of AI systems that uphold children’s rights [UNICEF, 2024]. The guidance emphasizes nine key requirements:

          1. Support children’s development and well-being.

          2. Ensure inclusion of and for children.

          3. @@ -448,7 +448,7 @@

            [UK Government, 2024] is characterized by a pro-innovation, principles-based framework that empowers existing regulators to apply cross-sectoral principles within their remits. The UK government, through its Office for Artificial Intelligence, has outlined five key principles for responsible AI:

            +
          4. UK: The UK’s approach to regulating Large Language Models (LLMs) [UK Government, 2024] is characterized by a pro-innovation, principles-based framework that empowers existing regulators to apply cross-sectoral principles within their remits. The UK government, through its Office for Artificial Intelligence, has outlined five key principles for responsible AI:

            1. safety, security, and robustness;

            2. appropriate transparency and explainability;

            3. @@ -457,7 +457,7 @@

              [Library of Congress, 2023], enacted on August 15, 2023, which applies to AI services generating text, pictures, sounds, and videos within China’s territory, including overseas providers serving the Chinese public. It includes the following key requirements:

              +
            4. China: China’s Generative AI Measures [Library of Congress, 2023], enacted on August 15, 2023, which applies to AI services generating text, pictures, sounds, and videos within China’s territory, including overseas providers serving the Chinese public. It includes the following key requirements:

              • Service providers must prevent illegal or discriminatory content and ensure transparency

              • Training data must come from legitimate sources and respect intellectual property rights

              • @@ -469,7 +469,7 @@

                [National Institute of Standards and Technology, 2024]. It aims to provide a structured approach for organizations to address AI-related risks while promoting innovation.

                +
              • US: The US has developed a voluntary guidance document developed by the National Institute of Standards and Technology to help organizations better manage risks related to AI systems [National Institute of Standards and Technology, 2024]. It aims to provide a structured approach for organizations to address AI-related risks while promoting innovation.

                • Core Structure:

                    @@ -492,11 +492,11 @@

                    -

                    6.3.2. Private Sector

                    +

                    6.3.2. Private Sector

                    Major GenAI players from the private sector also published guidance on how they are approaching towards regulating LLMs. We cover OpenAI, Anthropic and Google’s views. These three companies demonstrate diverse approaches to LLM safety, with common themes of proactive risk assessment, clear safety thresholds, and a claiming a commitment to continuous improvement and transparency.

                    -

                    6.3.2.1. OpenAI

                    -

                    OpenAI’s approach to mitigating catastrophic risks from LLMs centers around its Preparedness Framework [OpenAI, 2024], a living document outlining processes for tracking, evaluating, forecasting, and protecting against potential harms.

                    +

                    6.3.2.1. OpenAI

                    +

                    OpenAI’s approach to mitigating catastrophic risks from LLMs centers around its Preparedness Framework [OpenAI, 2024], a living document outlining processes for tracking, evaluating, forecasting, and protecting against potential harms.

                    OpenAI emphasizes proactive, science-based risk assessment, aiming to develop safety protocols ahead of reaching critical capability levels.

                    The framework comprises five key elements:

                      @@ -515,14 +515,14 @@

                      OpenAI's Preparedness Framework Risk Scoring
                      -

                      Fig. 6.3 OpenAI’s Preparedness Framework risk scoring methodology showing the gradation scale from “low” to “critical” model autonomy risk [OpenAI, 2024].

                      +

                      Fig. 6.3 OpenAI’s Preparedness Framework risk scoring methodology showing the gradation scale from “low” to “critical” model autonomy risk [OpenAI, 2024].

        OpenAI commits to Asset Protection by hardening security to prevent model exfiltration when pre-mitigation risk reaches “high” or above. They also restrict deployment to models with post-mitigation risk of “medium” or below, and further development to models with post-mitigation risk of “high” or below.

        -

        6.3.2.2. Anthropic

        -

        Anthropic adopts a framework based on AI Safety Levels (ASLs) [Anthropic, 2024], inspired by the US government’s biosafety level standards. ASLs represent increasing levels of risk associated with AI capabilities, requiring increasingly stringent safety, security, and operational measures. Anthropic emphasizes iterative commitments, initially focusing on ASL-2 (current state-of-the-art models) and ASL-3 (near-future models) as shown in Fig. 6.4.

        +

        6.3.2.2. Anthropic

        +

        Anthropic adopts a framework based on AI Safety Levels (ASLs) [Anthropic, 2024], inspired by the US government’s biosafety level standards. ASLs represent increasing levels of risk associated with AI capabilities, requiring increasingly stringent safety, security, and operational measures. Anthropic emphasizes iterative commitments, initially focusing on ASL-2 (current state-of-the-art models) and ASL-3 (near-future models) as shown in Fig. 6.4.

        Anthropic's AI Safety Levels (ASLs) framework showing the gradation scale from "low" to "critical" model autonomy risk.
        @@ -550,12 +550,12 @@

        -

        6.3.2.3. Google

        -

        Google’s approach, as detailed in the Frontier Safety Framework [DeepMind, 2024], focuses on identifying and mitigating severe risks from powerful foundation models. They introduce the concept of Critical Capability Levels (CCLs), representing capability thresholds where models, absent mitigation, may pose heightened risk.

        +

        6.3.2.3. Google

        +

        Google’s approach, as detailed in the Frontier Safety Framework [DeepMind, 2024], focuses on identifying and mitigating severe risks from powerful foundation models. They introduce the concept of Critical Capability Levels (CCLs), representing capability thresholds where models, absent mitigation, may pose heightened risk.

        Google's Frontier Safety Framework Risk Scoring
        -

        Fig. 6.5 Google’s Frontier Safety Framework Risk Scoring [DeepMind, 2024].

        +

        Fig. 6.5 Google’s Frontier Safety Framework Risk Scoring [DeepMind, 2024].

        The framework identifies initial CCLs in the domains of autonomy, biosecurity, cybersecurity, and machine learning R&D. Key components of the framework include:

        @@ -568,23 +568,23 @@

        -

        6.3.3. Rubrics

        +

        6.3.3. Rubrics

        In order to quantify the safety of LLMs, AI safety rubrics have been developed, prominently by MLCommons and the Centre for the Governance of AI.

        -

        6.3.3.1. MLCommons AI Safety Benchmark

        -

        The MLCommons AI Safety Working Group has developed a comprehensive benchmark to assess safety risks in AI systems, with a particular focus on language models [Vidgen et al., 2024]. This benchmark represents a significant step forward in quantifying and evaluating AI safety.

        +

        6.3.3.1. MLCommons AI Safety Benchmark

        +

        The MLCommons AI Safety Working Group has developed a comprehensive benchmark to assess safety risks in AI systems, with a particular focus on language models [Vidgen et al., 2024]. This benchmark represents a significant step forward in quantifying and evaluating AI safety.

        The benchmark incorporates:

        • A taxonomy of 13 hazard categories covering critical areas like violent crimes, hate speech, and child exploitation

        • Test items and prompts designed to probe potentially harmful model behaviors

        • Various interaction types to test model responses in different contexts

        • -
        • An automated evaluation system powered by LlamaGuard [Meta-AI, 2024]

        • +
        • An automated evaluation system powered by LlamaGuard [Meta-AI, 2024]

        -

        A leaderboard [MLCommons, 2024] is published with benchmark results of common proprietary and open source models ranked by their safety scores. For instance, Claude 3.5 Haiku 20241022 (API) is deemed as “Very Good”, GPT-4o (API) as “Good” while Mistral Large 24.11 (API) shown in Fig. 6.6 is deemed as “Fair”.

        +

        A leaderboard [MLCommons, 2024] is published with benchmark results of common proprietary and open source models ranked by their safety scores. For instance, Claude 3.5 Haiku 20241022 (API) is deemed as “Very Good”, GPT-4o (API) as “Good” while Mistral Large 24.11 (API) shown in Fig. 6.6 is deemed as “Fair”.

        MLCommons AI Safety Benchmark
        -

        Fig. 6.6 MLCommons AI Safety Benchmark Results for Mistral Large 24.11 (API) [Vidgen et al., 2024].

        +

        Fig. 6.6 MLCommons AI Safety Benchmark Results for Mistral Large 24.11 (API) [Vidgen et al., 2024].

        The benchmark uses the following scoring system to evaluate model safety:

        @@ -598,12 +598,12 @@

        -

        6.3.3.2. Centre for the Governance of AI Rubric

        -

        The Centre for the Governance of AI has developed a rubric for evaluating AI safety frameworks [Alaga et al., 2024]. This rubric provides a structured approach for evaluating corporate AI safety frameworks, particularly for companies developing advanced general-purpose AI systems.

        +

        6.3.3.2. Centre for the Governance of AI Rubric

        +

        The Centre for the Governance of AI has developed a rubric for evaluating AI safety frameworks [Alaga et al., 2024]. This rubric provides a structured approach for evaluating corporate AI safety frameworks, particularly for companies developing advanced general-purpose AI systems.

        Centre for the Governance of AI Rubric
        -

        Fig. 6.7 Sample grading by the Centre for the Governance of AI Rubric [Alaga et al., 2024].

        +

        Fig. 6.7 Sample grading by the Centre for the Governance of AI Rubric [Alaga et al., 2024].

        Fig. 6.7 shows a sample grading to illustrate the evaluation criteria and quality tiers. The rubric evaluates safety frameworks across three key dimensions:

        @@ -616,8 +616,8 @@

        -

        6.3.4. Porquoi

        -

        Do we need regulations specifically for LLMs? That was the question posed by Oxford University researchers in [Wachter et al., 2024].

        +

        6.3.4. Porquoi

        +

        Do we need regulations specifically for LLMs? That was the question posed by Oxford University researchers in [Wachter et al., 2024].

        Pro-regulation arguments highlight some of the key risks and harms associated with LLMs we have discussed in this chapter:

        • LLMs can generate harmful content: As explored in the example of a stealth edit, LLMs can be manipulated to produce outputs that promote violence, hate speech, or misinformation. Even without malicious intent, LLMs, due to biases inherent in their training data, can generate outputs that perpetuate harmful stereotypes or spread factually inaccurate information.

        • @@ -634,17 +634,17 @@

          -

          6.4. Approaches

          +

          6.4. Approaches

          Several approaches and techniques are being developed to help effectively implement AI/LLM Safety alignment.

          -

          6.4.1. Red Teaming

          +

          6.4.1. Red Teaming

          Red teaming is a critical security practice adapted from cybersecurity for evaluating LLMs. Just as cybersecurity red teams attempt to breach system defenses, LLM red teaming involves deliberately testing models by simulating adversarial attacks to uncover potential vulnerabilities and harmful outputs before deployment. We can outline LLMs Red teaming around three key aspects:

          1. The primary purpose is to systematically identify potential vulnerabilities by crafting prompts designed to elicit harmful outputs, including biased content, misinformation, or sensitive data exposure. Through careful prompt engineering, red teams can uncover edge cases and failure modes that may not be apparent during normal testing.

          2. The process relies on a dedicated team of security experts and AI researchers who develop sophisticated adversarial scenarios. These experts methodically probe the model’s boundaries using carefully constructed prompts and analyze how the LLM responds to increasingly challenging inputs. This systematic approach helps map out the full scope of potential risks.

          3. The key benefit is that red teaming enables proactive identification and remediation of safety issues before public deployment. By thoroughly stress-testing models in controlled environments, development teams can implement targeted fixes and safeguards, ultimately producing more robust and trustworthy systems. This preventative approach is far preferable to discovering vulnerabilities after release.

          -

          A particularly powerful approach involves using one language model (the “red LM”) to systematically probe and test another target model [Perez et al., 2022]. The red LM generates diverse test cases specifically crafted to elicit problematic behaviors, while a classifier evaluates the target model’s responses for specific categories of harm.

          +

          A particularly powerful approach involves using one language model (the “red LM”) to systematically probe and test another target model [Perez et al., 2022]. The red LM generates diverse test cases specifically crafted to elicit problematic behaviors, while a classifier evaluates the target model’s responses for specific categories of harm.

          This LLM-based red teaming process consists of three main components:

          1. Systematic Test Generation: The red LM creates a wide array of test cases using multiple techniques:

            @@ -663,7 +663,7 @@

            [Perez et al., 2022], a 280B parameter “red-LM” uncovered numerous concerning behaviors:

            +

            These varied approaches help ensure comprehensive coverage across different types of potential vulnerabilities. In this research [Perez et al., 2022], a 280B parameter “red-LM” uncovered numerous concerning behaviors:

            • Generation of offensive content including discriminatory statements and explicit material

            • Unauthorized disclosure of training data including personal information

            • @@ -673,8 +673,8 @@

              -

              6.4.2. Constitutional AI

              -

              Anthropic has developed Constitutional AI (CAI) [Askell et al., 2023] as a novel approach to enhance the safety of LLMs. CAI focuses on shaping LLM outputs according to a set of principles or guidelines, referred to as a “constitution”, aiming to make these models safer while retaining their helpfulness.

              +

              6.4.2. Constitutional AI

              +

              Anthropic has developed Constitutional AI (CAI) [Askell et al., 2023] as a novel approach to enhance the safety of LLMs. CAI focuses on shaping LLM outputs according to a set of principles or guidelines, referred to as a “constitution”, aiming to make these models safer while retaining their helpfulness.

              Here’s how Anthropic utilizes CAI to promote LLM safety:

              • Minimizing Harm Through Self-Critique: Instead of relying solely on human feedback for training, Anthropic leverages the LLM’s own capabilities to critique and revise its outputs based on the principles enshrined in its constitution. This approach is termed “Reinforcement Learning from AI Feedback (RLAIF)”.

              • @@ -686,15 +686,15 @@

                Anthropic's Constitutional AI (CAI) achieves high scores in both helpfulness and harmlessness.
                -

                Fig. 6.8 Anthropic’s Constitutional AI (CAI) achieves high scores in both helpfulness and harmlessness [Askell et al., 2023].

                +

                Fig. 6.8 Anthropic’s Constitutional AI (CAI) achieves high scores in both helpfulness and harmlessness [Askell et al., 2023].

        Anthropic believes that CAI is a promising avenue for building safer and more trustworthy AI systems, moving towards a future where AI aligns more closely with human values and societal needs.

        -

        6.4.3. Explainable AI (XAI)

        +

        6.4.3. Explainable AI (XAI)

        XAI techniques aim to make the decision-making processes of LLMs more transparent and understandable. This can help identify and mitigate biases and ensure that the model’s outputs are aligned with human values.

        -

        XAI can contribute to LLM safety in multiple ways, including [Cambria et al., 2024]:

        +

        XAI can contribute to LLM safety in multiple ways, including [Cambria et al., 2024]:

        • Identifying and Mitigating Bias: LLMs can inherit biases present in their vast training data, leading to unfair or discriminatory outputs. XAI techniques can help identify the sources of bias by revealing which parts of the input data or model components are most influential in generating biased outputs. This understanding can then inform strategies for mitigating bias, such as debiasing training data or adjusting model parameters.

        • Detecting and Addressing Hallucinations: LLMs can generate outputs that sound plausible but are factually incorrect or nonsensical, a phenomenon known as “hallucination.” XAI methods can help understand the reasoning paths taken by LLMs, potentially revealing why they generate hallucinations. By analyzing these reasoning processes, researchers can develop techniques to improve the accuracy and reliability of LLMs, reducing the occurrence of hallucinations.

        • @@ -704,7 +704,7 @@

          -

          6.5. Designing a Safety Plan

          +

          6.5. Designing a Safety Plan

          Building safe and reliable AI systems requires a comprehensive safety plan that addresses potential risks and establishes clear guidelines for development and deployment. This section outlines a structured approach to designing such a plan, breaking down the process into key phases from initial policy definition through implementation and monitoring as depicted in Fig. 6.9.

          Safety Plan Design Phases @@ -713,7 +713,7 @@

          -

          6.5.1. Phase 1. Policy Definition

          +

          6.5.1. Phase 1. Policy Definition

          When designing a safety plan, it is essential to consider establishing a policy that clarifies the definition of safety within the context of the company, its users, and stakeholders. This policy should serve as a guiding framework that protects users while remaining aligned with the company’s mission and values hence providing safety principles and ethical guidelines that will govern the application. Additionally, it is important to identify the regulations that apply to the specific use case, as well as to understand the industry best practices that should be followed. Finally, determining the organization’s risk tolerance is crucial in shaping the overall safety strategy.

          Questions to Ask:

            @@ -745,7 +745,7 @@

            -

            6.5.2. Phase 2. User Research & Risk Identification

            +

            6.5.2. Phase 2. User Research & Risk Identification

            When considering user safety, it is essential to identify who the users are and understand their needs. Ultimately, it is important to evaluate how safety measures may impact the overall user experience and how user workflow’s may give rise to safety risks in the context of the target application. Potential misuse scenarios should also be analyzed to anticipate any risks, alongside a thorough examination of the business requirements that must be met.

            Questions to Ask:

              @@ -777,7 +777,7 @@

              -

              6.5.3. Phase 3. Evaluation Framework

              +

              6.5.3. Phase 3. Evaluation Framework

              Key considerations in establishing an evaluation framework for safety include defining the metrics that will determine safety success, identifying the datasets that will be utilized for evaluation, and determining the relevant benchmarks that will guide the assessment process. Additionally, it is crucial to establish a method for measuring the trade-offs between safety and user experience, ensuring that both aspects are adequately addressed in the product development lifecycle.

              Questions to Ask:

                @@ -807,7 +807,7 @@

                -

                6.5.4. Phase 4. Safety Architecture Design

                +

                6.5.4. Phase 4. Safety Architecture Design

                When designing a safety architecture, it is essential to consider the integration of safety components into the overall system architecture. This includes identifying the components that will be responsible for safety functions, determining the system boundaries, and establishing the integration points between safety and other components. Additionally, it is crucial to consider the performance requirements and scalability needs of the safety system, ensuring that it can handle the expected load and maintain a high level of reliability.

                Questions to Ask:

                  @@ -837,7 +837,7 @@

                  -

                  6.5.5. Phase 5. Implementation & Tools Selection

                  +

                  6.5.5. Phase 5. Implementation & Tools Selection

                  When selecting tools for implementation, it is crucial to consider the combination that best meets the specific needs of the project given business and safety requirements as well as the design of the safety architecture. Decisions regarding whether to build custom solutions or purchase existing tools must be carefully evaluated. Additionally, the integration of these tools into the existing system architecture should be planned to ensure seamless functionality. Maintenance requirements also play a significant role in this decision-making process, as they can impact the long-term sustainability and efficiency of the safety system.

                  Questions to Ask:

                    @@ -867,7 +867,7 @@

                    -

                    6.5.6. Phase 6. Go-to-Market

                    +

                    6.5.6. Phase 6. Go-to-Market

                    Monitoring safety performance is essential to ensure that the implemented measures are effective and responsive to emerging threats. Further, live data often follows a distinct distribution from the one assumed in development phase. This should be monitored in order to allow for re-evaluation of pre-launch assumptions as well as to retrofit live data into models in use if applicable for continued enhanced performance.

                    Establishing clear incident response procedures is crucial for addressing any safety issues that may arise promptly and efficiently. Additionally, a robust strategy for handling updates must be in place to adapt to new challenges and improve system resilience, particularly when underlying LLM-based components often suffer from continuous updates.

                    Questions to Ask:

                    @@ -900,7 +900,7 @@

                    -

                    6.5.7. Common Pitfalls

                    +

                    6.5.7. Common Pitfalls

                    Policy Neglect. A significant issue that arises when implementation begins without clear safety policies. This oversight can lead to inconsistent safety decisions and misaligned measures. A common consequence is having a “moving target”. Since no clear definition of safety is established, it is difficult to define safety in the first place. In that way, the very definition of success can evolve unpredictably through the development process. To mitigate this risk, it is essential to establish a comprehensive policy that serves as a guiding North Star for safety-related efforts.

                    Late Evals. Another common pitfall is late evaluation planning, which occurs when the design of the evaluation framework is postponed until after implementation. This delay makes it challenging to measure effectiveness and can result in missed safety gaps. To address this, the evaluation framework should be designed early in the process and integrated throughout the development cycle.

                    Weak Evals. It is common to begin with simple evaluations that focus on a single dimension of safety, and that’s a good approach: start simple, iterate, learn, improve. However, the real mistake occurs when these initial checks are not evolved throughout the development cycle. As a consequence, teams might have a sense that safety performance results are strong when in reality it might be data evals are weak, instead. Before moving to production, it is crucial to establish well-balanced datasets that represent safety risks in a nuanced manner better representing real-world user scenarios.

                    @@ -910,12 +910,12 @@

                    -

                    6.6. Technical Implementation Components

                    +

                    6.6. Technical Implementation Components

                    -

                    6.6.1. Benchmarks & Datasets

                    +

                    6.6.1. Benchmarks & Datasets

                    -

                    6.6.1.1. SALAD-Bench

                    -

                    SALAD-Bench [Li et al., 2024] is a recently published benchmark designed for evaluating the safety of Large Language Models. It aims to address limitations of prior safety benchmarks which focused on a narrow perspective of safety threats, lacked challenging questions, relied on time-consuming and costly human evaluation, and were limited in scope. SALAD-Bench offers several key features to aid in LLM safety:

                    +

                    6.6.1.1. SALAD-Bench

                    +

                    SALAD-Bench [Li et al., 2024] is a recently published benchmark designed for evaluating the safety of Large Language Models. It aims to address limitations of prior safety benchmarks which focused on a narrow perspective of safety threats, lacked challenging questions, relied on time-consuming and costly human evaluation, and were limited in scope. SALAD-Bench offers several key features to aid in LLM safety:

                    • Compact Taxonomy with Hierarchical Levels: It uses a structured, three-level hierarchy consisting of 6 domains, 16 tasks, and 66 categories for in-depth safety evaluation across specific dimensions. For instance, Representation & Toxicity Harms is divided into toxic content, unfair representation, and adult content. Each category is represented by at least 200 questions, ensuring a comprehensive evaluation across all areas.

                    • Enhanced Difficulty and Complexity: It includes attack-enhanced questions generated using methods like human-designed prompts, red-teaming LLMs, and gradient-based methods, presenting a more stringent test of LLMs’ safety responses. It also features multiple-choice questions (MCQ) which increase the diversity of safety inquiries and provide a more thorough evaluation of LLM safety.

                    • @@ -926,10 +926,10 @@

                      SALAD-Bench's compact taxonomy with hierarchical levels.
                      -

                      Fig. 6.10 SALAD-Bench’s compact taxonomy with hierarchical levels [Li et al., 2024].

                      +

                      Fig. 6.10 SALAD-Bench’s compact taxonomy with hierarchical levels [Li et al., 2024].

          -

          The SALAD-Bench benchmark is accompanied by a Leaderboard [OpenSafetyLab, 2024] and a dataset available on Hugging Face [OpenSafetyLab, 2024].

          +

          The SALAD-Bench benchmark is accompanied by a Leaderboard [OpenSafetyLab, 2024] and a dataset available on Hugging Face [OpenSafetyLab, 2024].

          SALAD_BENCH_DATASET = "OpenSafetyLab/Salad-Data"
          @@ -941,7 +941,7 @@ 

          [Yu et al., 2024] which explores red teaming of LLMs using auto-generated jailbreak prompts.

          +

          Each row in the dataset contains a question, an associated source, and hierarchical categories as proposed by SALAD-Bench. The question is a potentially harmful prompt to be evaluated, which has been aggregated by a source. An example of a source is “GPTFuzzer” [Yu et al., 2024] which explores red teaming of LLMs using auto-generated jailbreak prompts.

          display(Markdown(dataset.to_pandas().head().to_markdown()))
          @@ -1047,8 +1047,8 @@ 

          -

          6.6.1.2. TruthfulQA

          -

          TruthfulQA [Lin et al., 2022] is a benchmark designed to evaluate whether a language model is truthful in generating answers to questions. It comprises 817 questions spanning 38 categories, including health, law, finance, and politics. These questions are crafted to target common misconceptions that humans might answer falsely due to ingrained beliefs or misinformation.

          +

          6.6.1.2. TruthfulQA

          +

          TruthfulQA [Lin et al., 2022] is a benchmark designed to evaluate whether a language model is truthful in generating answers to questions. It comprises 817 questions spanning 38 categories, including health, law, finance, and politics. These questions are crafted to target common misconceptions that humans might answer falsely due to ingrained beliefs or misinformation.

          TruthfulQA evaluates LLMs in two primary tasks (see Fig. 6.11):

          • Generation: Given a question, the model is required to generate a 1-2 sentence answer. The primary objective is overall truthfulness, expressed as the percentage of the model’s answers that are true.

          • @@ -1057,7 +1057,7 @@

            TruthfulQA's evaluation methodology.
            -

            Fig. 6.11 TruthfulQA’s evaluation methodology [Lin et al., 2022].

            +

            Fig. 6.11 TruthfulQA’s evaluation methodology [Lin et al., 2022].

            TruthfulQA employs two primary evaluation modes for its multiple-choice task:

            @@ -1141,8 +1141,8 @@

            -

            6.6.1.3. HarmBench

            -

            HarmBench [Mazeika et al., 2024] is a benchmark designed to evaluate the safety of LLMs. Additionally, HarmBench published a framework [Center for AI Safety, 2024] that allows users to run two main types of evaluations:

            +

            6.6.1.3. HarmBench

            +

            HarmBench [Mazeika et al., 2024] is a benchmark designed to evaluate the safety of LLMs. Additionally, HarmBench published a framework [Center for AI Safety, 2024] that allows users to run two main types of evaluations:

            • Evaluating red teaming methods (attack methods) against a set of LLMs

            • Evaluating LLMs against a set of red teaming methods

            • @@ -1154,26 +1154,26 @@

              [2] as its core metric. ASR measures the percentage of adversarial attempts that successfully elicit undesired behavior from the model. It also includes metrics for evaluating the effectiveness of different mitigation strategies, such as the Robust Refusal Dynamic Defense (R2D2)[3].

              -

              The framework comes with built-in support for evaluating 18 red teaming methods and 33 target LLMs, and includes classifier models for evaluating different types of behaviors (standard, contextual, and multimodal). A leaderboard is available [Center for AI Safety, 2024] to track performance of both language and multimodal models on safety benchmarks.

              +

              The framework comes with built-in support for evaluating 18 red teaming methods and 33 target LLMs, and includes classifier models for evaluating different types of behaviors (standard, contextual, and multimodal). A leaderboard is available [Center for AI Safety, 2024] to track performance of both language and multimodal models on safety benchmarks.

              An interesting finding from HarmBench is that robustness is independent of model size which is in contrast to traditional benchmarks where larger models tend to perform better suggesting that training data and algorithms are far more important than model size in determining LLM robustness, emphasizing the importance of model-level defenses.

              Attack Success Rate (ASR) for different models.
              -

              Fig. 6.12 Attack Success Rate (ASR) for different models. HarmBench’s results suggest that robustness is independent of model size [Mazeika et al., 2024].

              +

              Fig. 6.12 Attack Success Rate (ASR) for different models. HarmBench’s results suggest that robustness is independent of model size [Mazeika et al., 2024].

              HarmBench can be used by LLM developers to proactively identify and address potential vulnerabilities in their models before deployment. By automating the red teaming process, HarmBench allows for more efficient and scalable evaluation of LLM safety, enabling developers to test their models against a wider range of adversarial scenarios. This helps improve the robustness of LLMs and reduce the risk of malicious use.

        -

        6.6.1.4. SafeBench

        -

        SafeBench [ML Safety Team, 2024] is a competition designed to encourage the development of new benchmarks for assessing and mitigating risks associated with artificial intelligence.

        +

        6.6.1.4. SafeBench

        +

        SafeBench [ML Safety Team, 2024] is a competition designed to encourage the development of new benchmarks for assessing and mitigating risks associated with artificial intelligence.

        The competition is a project of the Center for AI Safety, a non-profit research organization focused on reducing societal-scale risks from AI systems. The organization has previously developed benchmarks such as MMLU, the Weapons of Mass Destruction Proxy, and the out-of-distribution detection baseline.

        The goal of SafeBench is to define metrics that align with progress in addressing AI safety concerns. This is driven by the understanding that metrics play a crucial role in the field of machine learning (ML). Formalizing these metrics into benchmarks is essential for evaluating and predicting potential risks posed by AI models.

        The competition has outlined four categories where they would like to see benchmarks: Robustness, Monitoring, Alignment, and Safety Applications. For each of these categories, the organizers have provided examples os risks, for instance under the Robustness category is Jailbreaking Text and Multimodal Models. This focuses on improving defenses against adversarial attacks. A submitted benchmark then could tackle new and ideally unseen jailbreaking attacks and defenses.

        -

        6.6.2. Tools & Techniques

        +

        6.6.2. Tools & Techniques

        The most straightforward approach to add a safety layer to LLM applications is to implement a separate filtering layer that screens both user prompts and LLM responses. Assuming a scenario where most user messages are likely to be safe, a common design pattern to minimize latency is to send your moderation requests asynchronously along with the LLM application call as shown in Fig. 6.13.

        Safety Layer @@ -1211,8 +1211,8 @@

        -

        6.6.2.1. Rules-Based Safety Filtering

        -

        Examples of tools that can be used as rules-based safety filters are Webpurify, LLM-Guard [ProtectAI, 2024], AWS Comprehend [Amazon Web Services, 2024], and NeMo Guardrails [NVIDIA, 2024] as detailed in Table 6.2.

        +

        6.6.2.1. Rules-Based Safety Filtering

        +

        Examples of tools that can be used as rules-based safety filters are Webpurify, LLM-Guard [ProtectAI, 2024], AWS Comprehend [Amazon Web Services, 2024], and NeMo Guardrails [NVIDIA, 2024] as detailed in Table 6.2.

    Table 8.5 LM Studio vs Jan vs OpenWebUI Comparison
    @@ -1273,13 +1273,13 @@

    -

    6.6.2.2. LLM-Based Safety Filtering

    +

    6.6.2.2. LLM-Based Safety Filtering

    Alternatively, an LLM-based component can be used as a content filter. Here, we observe three types os approaches: 1. Moderation API, 2. Fine-Tuned Open Source Models, and 3. Custom Moderation.

    Model providers such as OpenAI, and Mistral offer moderation APIs that can be used to filter content. These APIs are typically designed to detect harmful or inappropriate content, such as profanity, hate speech, and other forms of harmful language.

    -

    Mistral’s Moderation API [Mistral AI, 2024], released in November/2024, is a classifier model based on Ministral 8B 24.10. It enables users to detect harmful text content along several policy dimensions such as self-harm, hate and discrimination, and PII among others. It can be used to classify both raw text or conversational content. We will cover this API in more detail in the Case Study.

    +

    Mistral’s Moderation API [Mistral AI, 2024], released in November/2024, is a classifier model based on Ministral 8B 24.10. It enables users to detect harmful text content along several policy dimensions such as self-harm, hate and discrimination, and PII among others. It can be used to classify both raw text or conversational content. We will cover this API in more detail in the Case Study.

    # Mistral's Moderation API - Raw Text
     import os
     from mistralai import Mistral
    @@ -1315,7 +1315,7 @@ 

    print(response)

    -

    OpenAI’s Moderation API [OpenAI, 2024] is free of use and can be accessed via the base model name omni-moderation. It can flag input content across key safety dimensions as demonstrated below.

    +

    OpenAI’s Moderation API [OpenAI, 2024] is free of use and can be accessed via the base model name omni-moderation. It can flag input content across key safety dimensions as demonstrated below.

    from dotenv import load_dotenv
    @@ -1388,7 +1388,7 @@ 

    [Inan et al., 2023] is an implementation based on the risk categories as defined by the ML Commons consortium we introduced earlier. Three models have been released in its v3 iteration, in two classes:

    +

    Llama Guard model family [Inan et al., 2023] is an implementation based on the risk categories as defined by the ML Commons consortium we introduced earlier. Three models have been released in its v3 iteration, in two classes:

    1. Llama Guard 3 1B, Llama Guard 3 8B for text only processing and

    2. Llama Guard 3 11B-Vision for vision understanding

    3. @@ -1464,22 +1464,22 @@

      [Padhi et al., 2024] is a new competitor to Llama Guard family. It is a collection of models designed to help govern key risk dimensions as defined by IBM’s AI Risk Atlas [IBM, 2024]. The collection comprises two classes of models:

      +

      IBM Granite Guardian [Padhi et al., 2024] is a new competitor to Llama Guard family. It is a collection of models designed to help govern key risk dimensions as defined by IBM’s AI Risk Atlas [IBM, 2024]. The collection comprises two classes of models:

      1. Granite-Guardian-3.0-2B and Granite-Guardian-3.0-8B for detecting different forms of harmful content

      2. Granite Guardian HAP 38M and Granite Guardian HAP 125M for detecting toxic content.

      -

      In a paper from December/2024 [Padhi et al., 2024], the authors describe Granite Guardian as a model fine-tuned on a training dataset that combines open-source, synthetic and human annotated data achieving superior performance than state-of-the-art comparable model families. In Fig. 6.14 we observe that IBM Granite Guardian performance is overall superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension.

      +

      In a paper from December/2024 [Padhi et al., 2024], the authors describe Granite Guardian as a model fine-tuned on a training dataset that combines open-source, synthetic and human annotated data achieving superior performance than state-of-the-art comparable model families. In Fig. 6.14 we observe that IBM Granite Guardian performance is overall superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension.

      IBM Granite Guardian performance for the "Harm" risk dimension.
      -

      Fig. 6.14 IBM Granite Guardian performance is superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension [Padhi et al., 2024].

      +

      Fig. 6.14 IBM Granite Guardian performance is superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension [Padhi et al., 2024].

      The industry is increasingly focusing on the fine-tuning of pre-trained base models targeting a specific dimension of requirements and standards, here Safety being a critical one. This trend encompasses the release of open-source, fine-tuned safety models that can act as protective guardrails for LLM applications, as exemplified by LLaMa-Guard and IBM Granite Guardian. Additionally, there is a notable rise in models fine-tuned through techniques such as Reinforcement Learning from Human Feedback (RLHF), utilizing human preference datasets that incorporate safety considerations. These specialized models can function as safety filters as discussed but also as main models that alone could accomplished their original intended task safely without the need of external filters. We will cover this specific topic in the Chapter Preference-Based Alignment, where we will explore the process of aligning language models with human preferences ultimately leading to the development of an open source fine-tuned model that complies with user provided policy-based requirements.

      -

      6.6.2.3. Custom Moderation

      +

      6.6.2.3. Custom Moderation

      Custom moderation offers a tailored content filtering approach, enabling adherence to your own specific standards. As we have seen, each filtering-based approach we have discussed, while each having their own strengths, they all implement safety according to a pre-defined set of requirements or standards. Custom moderation, on the other hand, provides greater control compared to general moderation APIs or fine-tuned open source models though it requires more setup and maintenance.

      A common approach, when building a custom LLM-based filter, is to build an LLM-as-a-Judge filter as illustrated in Fig. 6.15. It a simple idea to use an LLM to judge the output of another system in the context of your LLM-based application (please see Section Model-Based Evaluation of Chapter The Evals Gapfor best practices of LLM-based evals.)

      @@ -1553,17 +1553,17 @@

      -

      6.7. Case Study: Implementing a Safety Filter

      +

      6.7. Case Study: Implementing a Safety Filter

      We will implement a basic safety filter for a K-12 application that will be used to filter content in a chat interface. The application will be designed to be used in a classroom setting where students and teachers can interact with the model to ask questions and receive answers. The safety filter will be designed to filter out harmful content such as profanity, hate speech, and other inappropriate content.

      In this stylized case study, we will limit our scope to the implementation of a safety filter for user prompts. We will not cover the implementation of the application itself or filtering the model’s output but rather focus on the user prompt safety filter. In real-world applications, an input policy would be paramount to better define what safety means before we identify associated risks and consecutive implementation decisions. Here, we will start with the design of the evals dataset (as we will see in a moment, skipping policy will lead to trouble later in the case study!)

      -

      6.7.1. Evals Dataset

      +

      6.7.1. Evals Dataset

      Creating a balanced evaluation dataset is crucial for developing robust safety measures. The dataset should be a well balanced set of “good” and “bad” samples to avoid biasing the model’s behavior in either direction.

      For this evaluation, we will create a dataset with NUM_SAMPLES examples, evenly split between good and bad samples (GOOD_SAMPLES and BAD_SAMPLES, respectively).

      -

      The good samples will be sourced from the UltraFeedback Binarized dataset [H4, 2024z], which contains high-quality, appropriate prompts that represent normal user interactions, often utilized to fine-tune models for instruction-following, truthfulness, honesty and helpfulness in a preference-based alignment process.

      +

      The good samples will be sourced from the UltraFeedback Binarized dataset [H4, 2024z], which contains high-quality, appropriate prompts that represent normal user interactions, often utilized to fine-tune models for instruction-following, truthfulness, honesty and helpfulness in a preference-based alignment process.

      The bad samples will come from two sources:

        -
      1. Profanity keywords from the Surge AI Profanity Dataset [Surge AI, 2024] - This provides examples of explicit inappropriate content.

      2. +
      3. Profanity keywords from the Surge AI Profanity Dataset [Surge AI, 2024] - This provides examples of explicit inappropriate content.

      4. Prompts sourced from Salad-Bench - These represent more subtle forms of harmful content like scams, harassment, or dangerous instructions, hence not necessarily mentioning an inappropriate keywords but rather a potentially harmful instruction.

      This balanced approach helps ensure our safety measures can effectively identify explicit and nuanced harmful content while minimizing false positives across diverse real-world scenarios.

      @@ -1576,7 +1576,7 @@

      -

      6.7.1.1. Bad Samples

      +

      6.7.1.1. Bad Samples

      def get_profanity_samples(num_samples, show_stats=True):
      @@ -1718,7 +1718,7 @@ 

      -

      6.7.1.2. Good Samples

      +

      6.7.1.2. Good Samples

      def get_good_samples(num_samples):
      @@ -1899,7 +1899,7 @@ 

      -

      6.7.2. Safety Filters

      +

      6.7.2. Safety Filters

      We will implement four safety filters, one for each of the following:

      1. LLM-Guard

      2. @@ -1964,7 +1964,7 @@

        -

        6.7.2.1. LLM-Guard

        +

        6.7.2.1. LLM-Guard

        Next, we implement a concrete validator using LLM Guard. The LLMGuardValidator class combines two key scanners:

        • BanTopics: Flags content containing banned topics

        • @@ -2057,7 +2057,7 @@

          -

          6.7.2.2. Mistral Moderation API

          +

          6.7.2.2. Mistral Moderation API

          You will need a Mistral API key to use the Mistral Moderation API. You can get one by signing up for a Mistral account and creating an API key, which we will assume is stored in a local .env file under the MISTRAL_API_KEY variable.

          The MistralValidator class implements a safety validator using Mistral’s moderation API. It takes text input and returns a ValidationResult indicating whether the text is unsafe based on Mistral moderation categories. Example:

          {'sexual': False,
          @@ -2137,7 +2137,7 @@ 

          -

          6.7.2.3. OpenAI Moderation API

          +

          6.7.2.3. OpenAI Moderation API

          We implement a third safety filter using OpenAI’s Moderation API we had previously introduced.

          @@ -2202,7 +2202,7 @@

          -

          6.7.2.4. Custom Judge Validator

          +

          6.7.2.4. Custom Judge Validator

          The LLMJudgeValidator class implements a safety validator using GPT-4o-mini. It takes text input and returns a ValidationResult indicating whether the text is unsafe based on the prompt we previously introduced in Section Custom Moderation.

          @@ -2287,7 +2287,7 @@

          -

          6.7.3. Benchmarking

          +

          6.7.3. Benchmarking

          We are ready to run our four safety filters against our dataset. We will each validator against 3 variations of our benchmark dataset:

          1. profanity-ultrafeedback: Using profanity dataset only for bad words together with ultrafeedback for good words

          2. @@ -2783,7 +2783,7 @@

            6.7.4. Takeaways

            +

            6.7.4. Takeaways

            • Safety is a complex problem and there is no one-size-fits-all solution.

            • Starting with a well-aligned policy is key to developing a robust data and evaluation framework.

            • @@ -2793,7 +2793,7 @@

              -

              6.8. Conclusion

              +

              6.8. Conclusion

              The rapid advancement of large language models has created an unsettling paradox: the same technologies that promise to revolutionize human-AI interaction also harbor significant risks that could undermine the very societies they aim to benefit. Our examination of various safety measures reveals that each approach has specific strengths and limitations when implemented in practice. However, instead of waiting for governments, organizations, and the public to catch up, we need to take action now.

              The case study on safety filters demonstrated the complexity of implementing even basic safety measures in real-world applications. What appears safe in one context may be inappropriate in another, and our current methods of safety evaluation often struggle with these nuances. The challenge of developing robust safety measures is further complicated by the potential for feedback loops in the training process - when models are fine-tuned on datasets that may contain hidden biases or problematic content.

              The path forward requires combining technical innovation with practical domain-specific wisdom. Safety in GenAI isn’t just a technical problem to be solved - it’s a mirror reflecting our own values, biases, and aspirations back at us. The growing focus on safety across the AI community, from open-source initiatives to corporate governance frameworks, provides a foundation for developing more robust safety measures. However, technologists working in isolation cannot solve these challenges - and may even perpetuate them unknowingly. Instead, domain experts across different verticals must come together to collaboratively define what safety means in the context of their specific users and broader society working in collaboration with the AI community.

              @@ -2811,233 +2811,233 @@

              -

              6.9. References

              +

              6.9. References

              -
              +
              [ASA24] (1,2)

              Jide Alaga, Jonas Schuett, and Markus Anderljung. A grading rubric for ai safety frameworks. 2024. URL: https://arxiv.org/abs/2409.08751, arXiv:2409.08751.

              -
              +
              [ABC+23] (1,2)

              Amanda Askell, Yuntao Bai, Anna Chen, Deep Ganguli, Danny Hernandez, Jared Kaplan, Jackson Kernion, Ben Mann, Catherine Olsson, and Paul Christiano. Constitutional ai: harmlessness from ai feedback. 2023. URL: https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback.

              -
              +
              [BHY+24]

              Yoshua Bengio, Geoffrey Hinton, Andrew Yao, Dawn Song, Pieter Abbeel, Trevor Darrell, Yuval Noah Harari, Ya-Qin Zhang, Lan Xue, Shai Shalev-Shwartz, Gillian Hadfield, Jeff Clune, Tegan Maharaj, Frank Hutter, Atılım Güneş Baydin, Sheila McIlraith, Qiqi Gao, Ashwin Acharya, David Krueger, Anca Dragan, Philip Torr, Stuart Russell, Daniel Kahneman, Jan Brauner, and Sören Mindermann. Managing extreme ai risks amid rapid progress. Science, 384(6698):842–845, 2024. URL: https://www.science.org/doi/abs/10.1126/science.adn0117, arXiv:https://www.science.org/doi/pdf/10.1126/science.adn0117, doi:10.1126/science.adn0117.

              -
              +
              [BBC+24] (1,2)

              Victoria Benjamin, Emily Braca, Israel Carter, Hafsa Kanchwala, Nava Khojasteh, Charly Landow, Yi Luo, Caroline Ma, Anna Magarelli, Rachel Mirin, Avery Moyer, Kayla Simpson, Amelia Skawinski, and Thomas Heverin. Systematically analyzing prompt injection vulnerabilities in diverse llm architectures. 2024. URL: https://arxiv.org/abs/2410.23308, arXiv:2410.23308.

              -
              +
              [BMC+24] (1,2)

              Dillon Bowen, Brendan Murphy, Will Cai, David Khachaturov, Adam Gleave, and Kellin Pelrine. Data poisoning in llms: jailbreak-tuning and scaling laws. 2024. URL: https://arxiv.org/abs/2408.02946, arXiv:2408.02946.

              -
              +
              [CMM+24]

              Erik Cambria, Lorenzo Malandri, Fabio Mercorio, Navid Nobani, and Andrea Seveso. Xai meets llms: a survey of the relation between explainable ai and large language models. 2024. URL: https://arxiv.org/abs/2407.15248, arXiv:2407.15248.

              -
              +
              [Edg24] (1,2)

              Alec Edgington. How to exploit large language models for good or bad. SIAM News, 2024. URL: https://www.siam.org/publications/siam-news/articles/how-to-exploit-large-language-models-for-good-or-bad/.

              -
              +
              [Exa24] (1,2)

              Exabeam. Ai regulations and llm regulations: past, present, and future. Exabeam Blog, 2024. URL: https://www.exabeam.com/explainers/ai-cyber-security/ai-regulations-and-llm-regulations-past-present-and-future/.

              -
              +
              [GRB+24]

              Isabel O. Gallegos, Ryan A. Rossi, Joe Barrow, Md Mehrab Tanjim, Sungchul Kim, Franck Dernoncourt, Tong Yu, Ruiyi Zhang, and Nesreen K. Ahmed. Bias and fairness in large language models: a survey. 2024. URL: https://arxiv.org/abs/2309.00770, arXiv:2309.00770.

              -
              +
              [H44z] -

              Hugging Face H4. Ultrafeedback binarized dataset. 2024z. A dataset of binary preference data for training language models. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized.

              +

              HuggingFace H4. Ultrafeedback binarized dataset. 2024z. A dataset of binary preference data for training language models. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized.

              -
              +
              [HGP+22]

              Thomas Hartvigsen, Saadia Gabriel, Hamid Palangi, Maarten Sap, Dipankar Ray, and Ece Kamar. ToxiGen: a large-scale machine-generated dataset for adversarial and implicit hate speech detection. In Smaranda Muresan, Preslav Nakov, and Aline Villavicencio, editors, Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), 3309–3326. Dublin, Ireland, May 2022. Association for Computational Linguistics. URL: https://aclanthology.org/2022.acl-long.234, doi:10.18653/v1/2022.acl-long.234.

              -
              +
              [HYM+24] (1,2)

              Lei Huang, Weijiang Yu, Weitao Ma, Weihong Zhong, Zhangyin Feng, Haotian Wang, Qianglong Chen, Weihua Peng, Xiaocheng Feng, Bing Qin, and Ting Liu. A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions. ACM Transactions on Information Systems, November 2024. URL: http://dx.doi.org/10.1145/3703155, doi:10.1145/3703155.

              -
              +
              [IUC+23]

              Hakan Inan, Kartikeya Upasani, Jianfeng Chi, Rashi Rungta, Krithika Iyer, Yuning Mao, Michael Tontchev, Qing Hu, Brian Fuller, Davide Testuggine, and Madian Khabsa. Llama guard: llm-based input-output safeguard for human-ai conversations. 2023. URL: https://arxiv.org/abs/2312.06674, arXiv:2312.06674.

              -
              +
              [LDW+24] (1,2)

              Lijun Li, Bowen Dong, Ruohui Wang, Xuhao Hu, Wangmeng Zuo, Dahua Lin, Yu Qiao, and Jing Shao. Salad-bench: a hierarchical and comprehensive safety benchmark for large language models. 2024. URL: https://arxiv.org/abs/2402.05044, arXiv:2402.05044.

              -
              +
              [LHE22] (1,2)

              Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: https://arxiv.org/abs/2109.07958, arXiv:2109.07958.

              -
              +
              [MPY+24] (1,2)

              Mantas Mazeika, Long Phan, Xuwang Yin, Andy Zou, Zifan Wang, Norman Mu, Elham Sakhaee, Nathaniel Li, Steven Basart, Bo Li, David Forsyth, and Dan Hendrycks. Harmbench: a standardized evaluation framework for automated red teaming and robust refusal. 2024. URL: https://arxiv.org/abs/2402.04249, arXiv:2402.04249.

              -
              +
              [MA24]

              Meta-AI. Llamaguard: llm-based input-output safeguard for human-ai conversations. Meta AI Research Publications, 2024. URL: https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/.

              -
              +
              [MLC24]

              MLCommons. Mlcommons ai illuminate benchmarks. 2024. A collection of standardized benchmarks for evaluating AI systems. URL: https://ailuminate.mlcommons.org/benchmarks/.

              -
              +
              [OAA+24]

              OpenAI, Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, Red Avila, Igor Babuschkin, Suchir Balaji, Valerie Balcom, Paul Baltescu, Haiming Bao, Mohammad Bavarian, Jeff Belgum, Irwan Bello, Jake Berdine, Gabriel Bernadett-Shapiro, Christopher Berner, Lenny Bogdonoff, Oleg Boiko, Madelaine Boyd, Anna-Luisa Brakman, Greg Brockman, Tim Brooks, Miles Brundage, Kevin Button, Trevor Cai, Rosie Campbell, Andrew Cann, Brittany Carey, Chelsea Carlson, Rory Carmichael, Brooke Chan, Che Chang, Fotis Chantzis, Derek Chen, Sully Chen, Ruby Chen, Jason Chen, Mark Chen, Ben Chess, Chester Cho, Casey Chu, Hyung Won Chung, Dave Cummings, Jeremiah Currier, Yunxing Dai, Cory Decareaux, Thomas Degry, Noah Deutsch, Damien Deville, Arka Dhar, David Dohan, Steve Dowling, Sheila Dunning, Adrien Ecoffet, Atty Eleti, Tyna Eloundou, David Farhi, Liam Fedus, Niko Felix, Simón Posada Fishman, Juston Forte, Isabella Fulford, Leo Gao, Elie Georges, Christian Gibson, Vik Goel, Tarun Gogineni, Gabriel Goh, Rapha Gontijo-Lopes, Jonathan Gordon, Morgan Grafstein, Scott Gray, Ryan Greene, Joshua Gross, Shixiang Shane Gu, Yufei Guo, Chris Hallacy, Jesse Han, Jeff Harris, Yuchen He, Mike Heaton, Johannes Heidecke, Chris Hesse, Alan Hickey, Wade Hickey, Peter Hoeschele, Brandon Houghton, Kenny Hsu, Shengli Hu, Xin Hu, Joost Huizinga, Shantanu Jain, Shawn Jain, Joanne Jang, Angela Jiang, Roger Jiang, Haozhun Jin, Denny Jin, Shino Jomoto, Billie Jonn, Heewoo Jun, Tomer Kaftan, Łukasz Kaiser, Ali Kamali, Ingmar Kanitscheider, Nitish Shirish Keskar, Tabarak Khan, Logan Kilpatrick, Jong Wook Kim, Christina Kim, Yongjik Kim, Jan Hendrik Kirchner, Jamie Kiros, Matt Knight, Daniel Kokotajlo, Łukasz Kondraciuk, Andrew Kondrich, Aris Konstantinidis, Kyle Kosic, Gretchen Krueger, Vishal Kuo, Michael Lampe, Ikai Lan, Teddy Lee, Jan Leike, Jade Leung, Daniel Levy, Chak Ming Li, Rachel Lim, Molly Lin, Stephanie Lin, Mateusz Litwin, Theresa Lopez, Ryan Lowe, Patricia Lue, Anna Makanju, Kim Malfacini, Sam Manning, Todor Markov, Yaniv Markovski, Bianca Martin, Katie Mayer, Andrew Mayne, Bob McGrew, Scott Mayer McKinney, Christine McLeavey, Paul McMillan, Jake McNeil, David Medina, Aalok Mehta, Jacob Menick, Luke Metz, Andrey Mishchenko, Pamela Mishkin, Vinnie Monaco, Evan Morikawa, Daniel Mossing, Tong Mu, Mira Murati, Oleg Murk, David Mély, Ashvin Nair, Reiichiro Nakano, Rajeev Nayak, Arvind Neelakantan, Richard Ngo, Hyeonwoo Noh, Long Ouyang, Cullen O'Keefe, Jakub Pachocki, Alex Paino, Joe Palermo, Ashley Pantuliano, Giambattista Parascandolo, Joel Parish, Emy Parparita, Alex Passos, Mikhail Pavlov, Andrew Peng, Adam Perelman, Filipe de Avila Belbute Peres, Michael Petrov, Henrique Ponde de Oliveira Pinto, Michael, Pokorny, Michelle Pokrass, Vitchyr H. Pong, Tolly Powell, Alethea Power, Boris Power, Elizabeth Proehl, Raul Puri, Alec Radford, Jack Rae, Aditya Ramesh, Cameron Raymond, Francis Real, Kendra Rimbach, Carl Ross, Bob Rotsted, Henri Roussez, Nick Ryder, Mario Saltarelli, Ted Sanders, Shibani Santurkar, Girish Sastry, Heather Schmidt, David Schnurr, John Schulman, Daniel Selsam, Kyla Sheppard, Toki Sherbakov, Jessica Shieh, Sarah Shoker, Pranav Shyam, Szymon Sidor, Eric Sigler, Maddie Simens, Jordan Sitkin, Katarina Slama, Ian Sohl, Benjamin Sokolowsky, Yang Song, Natalie Staudacher, Felipe Petroski Such, Natalie Summers, Ilya Sutskever, Jie Tang, Nikolas Tezak, Madeleine B. Thompson, Phil Tillet, Amin Tootoonchian, Elizabeth Tseng, Preston Tuggle, Nick Turley, Jerry Tworek, Juan Felipe Cerón Uribe, Andrea Vallone, Arun Vijayvergiya, Chelsea Voss, Carroll Wainwright, Justin Jay Wang, Alvin Wang, Ben Wang, Jonathan Ward, Jason Wei, CJ Weinmann, Akila Welihinda, Peter Welinder, Jiayi Weng, Lilian Weng, Matt Wiethoff, Dave Willner, Clemens Winter, Samuel Wolrich, Hannah Wong, Lauren Workman, Sherwin Wu, Jeff Wu, Michael Wu, Kai Xiao, Tao Xu, Sarah Yoo, Kevin Yu, Qiming Yuan, Wojciech Zaremba, Rowan Zellers, Chong Zhang, Marvin Zhang, Shengjia Zhao, Tianhao Zheng, Juntang Zhuang, William Zhuk, and Barret Zoph. Gpt-4 technical report. 2024. URL: https://arxiv.org/abs/2303.08774, arXiv:2303.08774.

              -
              +
              [PNC+24] (1,2,3)

              Inkit Padhi, Manish Nagireddy, Giandomenico Cornacchia, Subhajit Chaudhury, Tejaswini Pedapati, Pierre Dognin, Keerthiram Murugesan, Erik Miehling, Martín Santillán Cooper, Kieran Fraser, Giulio Zizzo, Muhammad Zaid Hameed, Mark Purcell, Michael Desmond, Qian Pan, Zahra Ashktorab, Inge Vejsbjerg, Elizabeth M. Daly, Michael Hind, Werner Geyer, Ambrish Rawat, Kush R. Varshney, and Prasanna Sattigeri. Granite guardian. 2024. URL: https://arxiv.org/abs/2412.07724, arXiv:2412.07724.

              -
              +
              [PCZ+23]

              Alexander Pan, Jun Shern Chan, Andy Zou, Nathaniel Li, Steven Basart, Thomas Woodside, Jonathan Ng, Hanlin Zhang, Scott Emmons, and Dan Hendrycks. Do the rewards justify the means? measuring trade-offs between rewards and ethical behavior in the machiavelli benchmark. 2023. URL: https://arxiv.org/abs/2304.03279, arXiv:2304.03279.

              -
              +
              [PHS+22] (1,2)

              Ethan Perez, Saffron Huang, Francis Song, Trevor Cai, Roman Ring, John Aslanides, Amelia Glaese, Nat McAleese, and Geoffrey Irving. Red teaming language models with language models. 2022. URL: https://arxiv.org/abs/2202.03286, arXiv:2202.03286.

              -
              -[SJLS22] +
              +[SJLS22]

              Lingfeng Shen, Haiyun Jiang, Lemao Liu, and Shuming Shi. Rethink the evaluation for attack strength of backdoor attacks in natural language processing. 2022. URL: https://arxiv.org/abs/2201.02993, arXiv:2201.02993.

              -
              +
              [SZW+24]

              Oliver J. Sutton, Qinghua Zhou, Wei Wang, Desmond J. Higham, Alexander N. Gorban, Alexander Bastounis, and Ivan Y. Tyukin. Stealth edits to large language models. 2024. URL: https://arxiv.org/abs/2406.12670, arXiv:2406.12670.

              -
              +
              [VAA+24] (1,2)

              Bertie Vidgen, Adarsh Agrawal, Ahmed M. Ahmed, Victor Akinwande, Namir Al-Nuaimi, Najla Alfaraj, Elie Alhajjar, Lora Aroyo, Trupti Bavalatti, Max Bartolo, Borhane Blili-Hamelin, Kurt Bollacker, Rishi Bomassani, Marisa Ferrara Boston, Siméon Campos, Kal Chakra, Canyu Chen, Cody Coleman, Zacharie Delpierre Coudert, Leon Derczynski, Debojyoti Dutta, Ian Eisenberg, James Ezick, Heather Frase, Brian Fuller, Ram Gandikota, Agasthya Gangavarapu, Ananya Gangavarapu, James Gealy, Rajat Ghosh, James Goel, Usman Gohar, Sujata Goswami, Scott A. Hale, Wiebke Hutiri, Joseph Marvin Imperial, Surgan Jandial, Nick Judd, Felix Juefei-Xu, Foutse Khomh, Bhavya Kailkhura, Hannah Rose Kirk, Kevin Klyman, Chris Knotz, Michael Kuchnik, Shachi H. Kumar, Srijan Kumar, Chris Lengerich, Bo Li, Zeyi Liao, Eileen Peters Long, Victor Lu, Sarah Luger, Yifan Mai, Priyanka Mary Mammen, Kelvin Manyeki, Sean McGregor, Virendra Mehta, Shafee Mohammed, Emanuel Moss, Lama Nachman, Dinesh Jinenhally Naganna, Amin Nikanjam, Besmira Nushi, Luis Oala, Iftach Orr, Alicia Parrish, Cigdem Patlak, William Pietri, Forough Poursabzi-Sangdeh, Eleonora Presani, Fabrizio Puletti, Paul Röttger, Saurav Sahay, Tim Santos, Nino Scherrer, Alice Schoenauer Sebag, Patrick Schramowski, Abolfazl Shahbazi, Vin Sharma, Xudong Shen, Vamsi Sistla, Leonard Tang, Davide Testuggine, Vithursan Thangarasa, Elizabeth Anne Watkins, Rebecca Weiss, Chris Welty, Tyler Wilbers, Adina Williams, Carole-Jean Wu, Poonam Yadav, Xianjun Yang, Yi Zeng, Wenhui Zhang, Fedor Zhdanov, Jiacheng Zhu, Percy Liang, Peter Mattson, and Joaquin Vanschoren. Introducing v0.5 of the ai safety benchmark from mlcommons. 2024. URL: https://arxiv.org/abs/2404.12241, arXiv:2404.12241.

              -
              +
              [VSK+24] (1,2)

              Bertie Vidgen, Nino Scherrer, Hannah Rose Kirk, Rebecca Qian, Anand Kannappan, Scott A. Hale, and Paul Röttger. Simplesafetytests: a test suite for identifying critical safety risks in large language models. 2024. URL: https://arxiv.org/abs/2311.08370, arXiv:2311.08370.

              -
              +
              [WMR24]

              Sandra Wachter, Brent Mittelstadt, and Chris Russell. Do large language models have a legal duty to tell the truth? Royal Society Open Science, 11(8):240197, 2024. URL: https://royalsocietypublishing.org/doi/abs/10.1098/rsos.240197, arXiv:https://royalsocietypublishing.org/doi/pdf/10.1098/rsos.240197, doi:10.1098/rsos.240197.

              -
              +
              [WCP+24]

              Boxin Wang, Weixin Chen, Hengzhi Pei, Chulin Xie, Mintong Kang, Chenhui Zhang, Chejian Xu, Zidi Xiong, Ritik Dutta, Rylan Schaeffer, Sang T. Truong, Simran Arora, Mantas Mazeika, Dan Hendrycks, Zinan Lin, Yu Cheng, Sanmi Koyejo, Dawn Song, and Bo Li. Decodingtrust: a comprehensive assessment of trustworthiness in gpt models. 2024. URL: https://arxiv.org/abs/2306.11698, arXiv:2306.11698.

              -
              +
              [YLX24]

              Jiahao Yu, Xingwei Lin, and Xinyu Xing. Gptfuzzer: red teaming large language models with auto-generated safety test cases. Papers with Code, 2024. URL: https://paperswithcode.com/dataset/gptfuzzer.

              -
              +
              [ZYY+24]

              Shuning Zhang, Lyumanshan Ye, Xin Yi, Jingyu Tang, Bo Shui, Haobin Xing, Pengfei Liu, and Hewu Li. "ghost of the past": identifying and resolving privacy leakage from llm's memory through proactive user interaction. 2024. URL: https://arxiv.org/abs/2410.14931, arXiv:2410.14931.

              -
              +
              [Zho24] -

              Qinghua Zhou. Stealth edits: detecting stealth edits in llm outputs. Hugging Face Spaces, 2024. URL: https://huggingface.co/spaces/qinghua-zhou/stealth-edits.

              +

              Qinghua Zhou. Stealth edits: detecting stealth edits in llm outputs. HuggingFace Spaces, 2024. URL: https://huggingface.co/spaces/qinghua-zhou/stealth-edits.

              -
              +
              [AmazonWServices24]

              Amazon Web Services. Amazon comprehend - natural language processing service. 2024. AWS natural language processing service for text analysis and content moderation. URL: https://aws.amazon.com/comprehend/.

              -
              +
              [Anthropic24]

              Anthropic. Anthropic's responsible scaling policy. Technical Report, Anthropic, 2024. URL: https://www-cdn.anthropic.com/1adf000c8f675958c2ee23805d91aaade1cd4613/responsible-scaling-policy.pdf.

              -
              +
              [CenterfASafety24a]

              Center for AI Safety. Harmbench. GitHub repository, 2024. Framework for evaluating language model safety. URL: https://github.com/centerforaisafety/HarmBench.

              -
              +
              [CenterfASafety24b]

              Center for AI Safety. Harmbench leaderboard. 2024. Leaderboard tracking performance of language models on safety benchmarks. URL: https://www.harmbench.org/results.

              -
              +
              [DeepMind24] (1,2)

              DeepMind. The frontier safety framework. Technical Report, DeepMind, 2024. URL: https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/introducing-the-frontier-safety-framework/fsf-technical-report.pdf.

              -
              +
              [EuropeanMAgency24]

              European Medicines Agency. Guiding principles for the use of large language models in regulatory science and medicines regulatory activities. Guidance Document, European Medicines Agency, 2024. URL: https://www.ema.europa.eu/en/documents/other/guiding-principles-use-large-language-models-regulatory-science-medicines-regulatory-activities_en.pdf.

              -
              +
              [FinancialIRAuthority24]

              Financial Industry Regulatory Authority. Artificial intelligence, including large language models and generative ai. Regulatory Notice 24-09, FINRA, 2024. URL: https://www.finra.org/rules-guidance/notices/24-09.

              -
              -[HarmBench24] +
              +[HarmBench24]

              HarmBench. Harmbench explorer. 2024. URL: https://www.harmbench.org/explore.

              -
              +
              [IBM24]

              IBM. Ibm watsonx.ai risk atlas. 2024. A framework for identifying and mitigating risks in AI systems. URL: https://www.ibm.com/docs/en/watsonx/saas?topic=ai-risk-atlas.

              -
              +
              [LibraryoCongress23]

              Library of Congress. China: generative ai measures finalized. July 2023. URL: https://www.loc.gov/item/global-legal-monitor/2023-07-18/china-generative-ai-measures-finalized/.

              -
              +
              [MistralAI24]

              Mistral AI. Mistral moderation: a technical report. 2024. URL: https://mistral.ai/news/mistral-moderation/.

              -
              +
              [MLSTeam24]

              ML Safety Team. Safebench: a comprehensive benchmark for llm safety evaluation. ML Safety Website, 2024. URL: https://www.mlsafety.org/safebench.

              -
              +
              [NationalIoSaTechnology24]

              National Institute of Standards and Technology. Ai risk management framework. Technical Report, National Institute of Standards and Technology, 2024. URL: https://www.nist.gov/itl/ai-risk-management-framework.

              -
              +
              [NVIDIA24]

              NVIDIA. Nemo-guardrails: an open-source toolkit for building reliable and safe llm applications. 2024. A framework for creating reliable and safe LLM applications with customizable guardrails. URL: https://github.com/NVIDIA/NeMo-Guardrails.

              -
              +
              [OpenAI24a]

              OpenAI. Openai moderation api. 2024. Documentation for OpenAI's content moderation API. URL: https://platform.openai.com/docs/guides/moderation.

              -
              +
              [OpenAI24b] (1,2)

              OpenAI. Openai preparedness framework. Technical Report, OpenAI, 2024. URL: https://cdn.openai.com/openai-preparedness-framework-beta.pdf.

              -
              +
              [OpenSafetyLab24a] -

              OpenSafetyLab. Salad-bench leaderboard. Hugging Face Space, 2024. URL: https://huggingface.co/spaces/OpenSafetyLab/Salad-Bench-Leaderboard.

              +

              OpenSafetyLab. Salad-bench leaderboard. HuggingFace Space, 2024. URL: https://huggingface.co/spaces/OpenSafetyLab/Salad-Bench-Leaderboard.

              -
              +
              [OpenSafetyLab24b] -

              OpenSafetyLab. Salad-data: a hierarchical and comprehensive safety dataset for large language models. Hugging Face Dataset, 2024. URL: https://huggingface.co/datasets/OpenSafetyLab/Salad-Data.

              +

              OpenSafetyLab. Salad-data: a hierarchical and comprehensive safety dataset for large language models. HuggingFace Dataset, 2024. URL: https://huggingface.co/datasets/OpenSafetyLab/Salad-Data.

              -
              +
              [ProtectAI24]

              ProtectAI. Llm-guard: comprehensive safety and security framework for large language models. 2024. An open-source toolkit for LLM security and safety. URL: https://github.com/protectai/llm-guard.

              -
              +
              [SurgeAI24]

              Surge AI. Surge ai profanity dataset. GitHub repository, 2024. A comprehensive dataset for training and evaluating profanity detection models. URL: https://github.com/surge-ai/profanity.

              -
              +
              [UKGovernment24]

              UK Government. Ai regulation: a pro-innovation approach. White Paper, Department for Science, Innovation and Technology, 2024. URL: https://www.gov.uk/government/publications/ai-regulation-a-pro-innovation-approach/white-paper.

              -
              +
              [UNICEF24]

              UNICEF. Policy guidance on ai for children. Policy Report, UNICEF Office of Research - Innocenti, 2024. URL: https://www.unicef.org/innocenti/reports/policy-guidance-ai-children.

              @@ -3051,11 +3051,11 @@

              [2] -

              Attack Success Rate (ASR) refers to a metric used in cybersecurity and machine learning to measure the percentage of times an attack successfully achieves its intended outcome, essentially indicating how effective a particular attack method is against a system or model; it is calculated by dividing the number of successful attacks by the total number of attempted attacks [Shen et al., 2022].

              +

              Attack Success Rate (ASR) refers to a metric used in cybersecurity and machine learning to measure the percentage of times an attack successfully achieves its intended outcome, essentially indicating how effective a particular attack method is against a system or model; it is calculated by dividing the number of successful attacks by the total number of attempted attacks [Shen et al., 2022].

      diff --git a/tamingllms/_build/html/notebooks/structured_output.html b/tamingllms/_build/html/notebooks/structured_output.html index 7369ab7..4974a8d 100644 --- a/tamingllms/_build/html/notebooks/structured_output.html +++ b/tamingllms/_build/html/notebooks/structured_output.html @@ -256,7 +256,7 @@
      -

      4. Structured Output

      +

      4. Structured Output

      In limits, there is freedom. Creativity thrives within structure.

      —Julia B. Cameron

      @@ -264,42 +264,42 @@
      -

      4.1. Introduction

      +

      4.1. Introduction

      Language Models excel at generating human-like text, but they often struggle to produce output in a structured format, consistently. This poses a significant challenge when we need LLMs to generate data that can be easily processed by downstream systems, such as databases, APIs, or other software applications. Even with a well-crafted prompt, an LLM might produce an unstructured response when a structured one is expected. This can be particularly challenging when integrating LLMs into systems that require specific data types and formats.

      -

      What user needs drive the demand for LLM output constraints? In a recent work by Google Research [Liu et al., 2024], the authors explored the user need for constraints on the output of large language models, drawing on a survey of 51 industry professionals who use LLMs in their work. User needs can be broadly categorized as follows:

      +

      What user needs drive the demand for LLM output constraints? In a recent work by Google Research [Liu et al., 2024], the authors explored the user need for constraints on the output of large language models, drawing on a survey of 51 industry professionals who use LLMs in their work. User needs can be broadly categorized as follows:

      1. Improving Developer Efficiency and Workflow

      • Reducing Trial and Error in Prompt Engineering: Developers find the process of crafting prompts to elicit desired output formats to be time-consuming, often involving extensive testing and iteration. LLM output constraints could make this process more efficient and predictable.

      • @@ -321,13 +321,13 @@

        -

        4.2. Problem Statement

        +

        4.2. Problem Statement

        Language models based on the Transformer architecture are next token prediction machines. These models calculate the probability of observing a token (from a vocabulary of size \(n\)) conditioned on the previous tokens in the sequence. This process can be expressed mathematically as:

        \[P(X) = P(x_1, x_2, \ldots, x_n) = \prod_{i=1}^n p(x_i|x_{<i})\]

        where, \(x_i\) represents the current token being generated, while \(x_{<i}\) encompasses all preceding tokens.

        -

        However, in practical applications, generating high-quality content requires more than just probabilistic next-token generation. The key challenge lies in incorporating control conditions (\(C\)) that guide the model to produce text with specific desired characteristics - whether that’s maintaining a consistent format, following syntactic rules, or adhering to semantic constraints. These control conditions must be integrated while preserving the model’s ability to generate natural, coherent text. This controlled text generation process can be formalized as [Liang et al., 2024]:

        +

        However, in practical applications, generating high-quality content requires more than just probabilistic next-token generation. The key challenge lies in incorporating control conditions (\(C\)) that guide the model to produce text with specific desired characteristics - whether that’s maintaining a consistent format, following syntactic rules, or adhering to semantic constraints. These control conditions must be integrated while preserving the model’s ability to generate natural, coherent text. This controlled text generation process can be formalized as [Liang et al., 2024]:

        \[P(X|C) = P(x_1, x_2, \ldots, x_n|C) = \prod_{i=1}^n p(x_i|x_{<i}, C)\]

        Here, \(C\) represents the set of constraints or control conditions that shape the generated output. Common constraints (\(C\)) include:

        @@ -341,8 +341,8 @@

        -

        4.3. Techniques

        -

        There are many techniques to obtain structured output from LLMs [Liang et al., 2024]. They can be broadly categorized into two types based on the phase they are applied to:

        +

        4.3. Techniques

        +

        There are many techniques to obtain structured output from LLMs [Liang et al., 2024]. They can be broadly categorized into two types based on the phase they are applied to:

        1. Training-Time Techniques (TTT): These techniques are applied during the training or post-training phases of the LLM. They are used to guide the model to learn the specific patterns and structures that are required for the task at hand.

        2. Inference-Time Techniques (ITT): These techniques are applied during the inference phase of the LLM. They are used to guide the model to produce the desired output at inference time.

        3. @@ -354,17 +354,17 @@

          NousResearch, 2024], a model trained on a specific system prompt for Structured Outputs able to respond according to following user provided JSON schema.

          +
        4. Example: NousResearch/Hermes-2-Theta-Llama-3-8B [NousResearch, 2024], a model trained on a specific system prompt for Structured Outputs able to respond according to following user provided JSON schema.

    4. Logit Post-Processing (ITT): Logit post-processing is a technique that involves modifying the logits of the LLM’s output before it is converted into text.

        -
      • Example: Outlines [Outlines, 2024], a Python package that allows to guide the generation process introducing logit biases. We will explore this solution later.

      • +
      • Example: Outlines [Outlines, 2024], a Python package that allows to guide the generation process introducing logit biases. We will explore this solution later.

    5. -

      4.3.1. Prompt Engineering

      +

      4.3.1. Prompt Engineering

      Perhaps the most common strategy to generate LLM response in a target format is using prompt engineering, in particular one-shot prompting, where the user provides an example of the desired output format within the prompt.

      As a motivating example, consider the following simple task: Given a segment of a SEC financial filing, generate a two-person discussion about key financial data from the text in JSON format, simulating what would be a real-world discussion about the underlying companies’ disclosed financial information. We would like to generate a structured output that can be easily parsed and integrated with other systems.

      In a one-shot prompting fashion, we can pass the following example in the prompt:

      @@ -490,7 +490,7 @@

      -

      4.3.2. JSON Mode (Fine-Tuned)

      +

      4.3.2. JSON Mode (Fine-Tuned)

      One-shot prompting is a simple technique that can lead to low-effort improvements in structured output, though may not be sufficient for complex (e.g. nested) structures and / or when the model’s output needs to be restricted to a specific set of options or types.

      Some models offer so-called “JSON Mode” as an attempt to handle those challenges. This is a feature provided by most LLM API providers today, such as OpenAI, that allows the model to generate output in JSON format. This is particularly useful when you need structured data as a result, such as when parsing the output programmatically or integrating it with other systems that require JSON input. As depicted in Fig. 4.1, JSON mode is implemented by instructing the LLM model to use JSON as response format and optionally defining a target schema.

      @@ -612,7 +612,7 @@

      -

      4.3.3. Logit Post-Processing

      +

      4.3.3. Logit Post-Processing

      Logit post-processing is a technique that involves modifying the logits of the LLM’s output before it is converted into text such that we have a “controlled” text generation.

      The text generation process follows a probabilistic approach. At each step, the model calculates the probability distribution over its entire vocabulary to determine the most likely next token.

      Let’s examine how an LLM processes an example prompt “Is Enzo a good name for a baby?” as depicted in Fig. 4.2:

      @@ -849,11 +849,11 @@

      -

      4.4. Tools

      +

      4.4. Tools

      -

      4.4.1. Outlines

      -

      Outlines [Outlines, 2024] is a library specifically focused on structured text generation from LLMs. Under the hood, Outlines works by adjusting the probability distribution of the model’s output logits - the raw scores from the final layer of the neural network that are normally converted into text tokens. By introducing carefully crafted logit biases, Outlines can guide the model to prefer certain tokens over others, effectively constraining its outputs to a predefined set of valid options.

      -

      The authors solve the general guided generation problem [Willard and Louf, 2023], which, as a consequence, solves the problem of structured output generation in LLMs by introducing an efficient indexing approach that reformulates neural text generation using finite-state machines (FSMs).

      +

      4.4.1. Outlines

      +

      Outlines [Outlines, 2024] is a library specifically focused on structured text generation from LLMs. Under the hood, Outlines works by adjusting the probability distribution of the model’s output logits - the raw scores from the final layer of the neural network that are normally converted into text tokens. By introducing carefully crafted logit biases, Outlines can guide the model to prefer certain tokens over others, effectively constraining its outputs to a predefined set of valid options.

      +

      The authors solve the general guided generation problem [Willard and Louf, 2023], which, as a consequence, solves the problem of structured output generation in LLMs by introducing an efficient indexing approach that reformulates neural text generation using finite-state machines (FSMs).

      They define the next token generation as a random variable:

      \[s_{t+1} \sim \text{Categorical}(\alpha) \text{ where } \alpha = \text{LLM}(S_t, \theta)\]
      @@ -884,7 +884,7 @@

      \(\tilde{s}_{t+1}\) is the next token sampled under constraints

      This formulation allows the masking operation to guide the generation process by zeroing out probabilities of invalid tokens according to the finite state machine states. But instead of checking the entire vocabulary (size N) at each generation step (O(N) complexity) to enforce output constraints, they convert constraints (regex/grammar) into FSM states and build an index mapping FSM states to valid vocabulary tokens. This achieves O(1) average complexity for token generation.

      -

      In summary, there are two stages in the Outlines framework [Tran-Thien, 2024]:

      +

      In summary, there are two stages in the Outlines framework [Tran-Thien, 2024]:

      1. Preprocessing Step: Outlines converts a character-level deterministic finite automaton (DFA) testing whether a string matches a regex into a token-level DFA testing whether a token sequence is decoded in a string matching the regex.

      2. Decoding Step: At decoding time, the DFA is used to determine, for each new token, which potential tokens are allowed. Starting from the initial state of the DFA, the allowed tokens are determined by the outgoing transitions from the current state. The corresponding mask is applied to the next token probabilities and these probabilities are renormalized. A new token can then be sampled and the state of the DFA updated.

      3. @@ -907,7 +907,7 @@

        Outlines State Machine
        -

        Fig. 4.3 Outlines State Machine [Tran-Thien, 2024].

        +

        Fig. 4.3 Outlines State Machine [Tran-Thien, 2024].

      The initial “Start” state contains a masking table that controls which tokens can begin the sequence. In this example, only characters from the set [YyNnAa] are allowed as valid first characters, with each having an assigned probability and mask value. The masking mechanism effectively filters out invalid tokens by setting their mask values to 0, ensuring only permitted transitions to the “First” state.

      @@ -997,7 +997,7 @@

      -

      4.4.2. LangChain

      +

      4.4.2. LangChain

      LangChain is a framework designed to simplify the development of LLM applications. It provides an abstraction layer over many LLM providers that in turn offers structured output.

      In particular, LangChain offers the with_structured_output method, which can be used with LLMs that support structured output APIs, allowing you to enforce a schema directly within the prompt.

      @@ -1052,12 +1052,12 @@

      Extracted places: ['California', 'Cupertino']

    -

    We observe that the model was able to extract the entities and places from the input text, and return them in the specified format. A full list of models that support .with_structured_output() can be found here. You can also use Outlines with LangChain [LangChain, 2024b].

    +

    We observe that the model was able to extract the entities and places from the input text, and return them in the specified format. A full list of models that support .with_structured_output() can be found here. You can also use Outlines with LangChain [LangChain, 2024b].

    -

    4.4.3. Ollama

    +

    4.4.3. Ollama

    Ollama is a popular tool that allows you to run LLMs locally (see Chapter Local LLMs in Practice). Ollama first introduced structured output generation in version 0.5.1 in late 2024 providing support for JSON output but highlighting additional formats are coming soon.

    -

    The current ollama implementation leverages LLama.cpp GBNF (GGML BNF) grammars [Ggerganov, 2024] to enable structured output generation. LLama.cpp GBNF forces language models to generate output in specific, predefined formats by constraining their outputs to follow precise rules and patterns. The system accomplishes this through a formal grammar specification that defines exactly how valid outputs can be constructed. It’s essentially an extension of BNF (Backus-Naur Form) [Wikipedia contributors, 2024] with some modern regex-like features added. These rules carefully define what elements are allowed, how they can be combined, and what patterns of repetition and sequencing are valid. By enforcing these constraints during generation, GBNF ensures the model’s output strictly adheres to the desired format.

    +

    The current ollama implementation leverages LLama.cpp GBNF (GGML BNF) grammars [Ggerganov, 2024] to enable structured output generation. LLama.cpp GBNF forces language models to generate output in specific, predefined formats by constraining their outputs to follow precise rules and patterns. The system accomplishes this through a formal grammar specification that defines exactly how valid outputs can be constructed. It’s essentially an extension of BNF (Backus-Naur Form) [Wikipedia contributors, 2024] with some modern regex-like features added. These rules carefully define what elements are allowed, how they can be combined, and what patterns of repetition and sequencing are valid. By enforcing these constraints during generation, GBNF ensures the model’s output strictly adheres to the desired format.

    Let’s replicate our previous structured output generation example with Ollama. First, make sure you have Ollama installed. You can find installation instructions here.

    curl -fsSL https://ollama.com/install.sh | sh
     pip install ollama
    @@ -1153,9 +1153,9 @@ 

    -

    4.5. Discussion

    +

    4.5. Discussion

    -

    4.5.1. Best Practices

    +

    4.5.1. Best Practices

    When implementing structured output with LLMs, it’s crucial to understand the distinction between different approaches. Some methods, such as logit post-processing, provide mathematical guarantees that the output will conform to the specified structure. This contrasts sharply with approaches like JSON mode, which rely on fine-tuned models or prompt engineering that offer no formal guarantees. This distinction becomes particularly important in production environments where reliability and consistency are paramount. With that in mind, here are some best practices to consider when implementing structured output generation with LLMs:

    • Clear Schema Definition: Define the desired output structure clearly. This can be done in several ways including schemas, types, or Pydantic models as appropriate.

    • @@ -1165,7 +1165,7 @@

      -

      4.5.2. Comparing Solutions

      +

      4.5.2. Comparing Solutions

      The choice of framework for structured LLM output depends heavily on specific constraints, requirements and use cases. LangChain is the most used LLM framework today with a large developer community base however its structured output generation depends on the underlying LLM provider support. Ollama enables straightforward local deployment and experimentation democratizing access to LLMs while fostering privacy and control, however today it only offers JSON format with further formats to come. Outlines emerges as a solution that provides formal guarantees with great flexibility and control over structured output generation while providing support for a wide range of LLMs. Table 4.1 provides a summary comparison of the different solutions.

    Table 6.2 Rules-Based Safety Filtering Tools.
    @@ -1204,26 +1204,26 @@

    [Guidance AI, 2024] and NVIDIA’s Logits Processor Zoo [NVIDIA, 2024a].

    +

    Other related tools not covered in this chapter worth mentioning include Guidance [Guidance AI, 2024] and NVIDIA’s Logits Processor Zoo [NVIDIA, 2024a].

    -

    4.5.3. Research and Ongoing Debate

    +

    4.5.3. Research and Ongoing Debate

    The use of structured output for Large Language Models is a developing area. While the ability to constrain LLM outputs offer clear benefits in parsing, robustness, and integration, there is growing debate on whether it also potentially comes at the cost of performance as well as reasoning abilities. Research in this area should be taken with a grain of salt since findings are mixed and often depend on the specific task and model family at hand furthermore model families are not always comparable and are getting updated by the day! Nonetheless, early findings provide some interesting insights as to why there is no one-size-fits-all solution when it comes to LLMs structured output.

    -

    There is some evidence indicating that LLMs may have bias in their handling of different output formats [Long et al., 2024]. This study examined common output structures like multiple-choice answers, wrapped text, lists, and key-value mappings. The authors analyzed key LLM model families, namely Gemma, Mistral, and ChatGPT, uncovering bias across multiple tasks and formats. The researchers attributed these biases to the models’ underlying token distributions for different formats. An example of this format bias emerged in the comparison between JSON and YAML outputs. While models like Mistral and Gemma excelled at generating JSON structures, they performed notably worse with YAML. Their YAML outputs often contained extraneous information that degrades output quality. This disparity likely stems from JSON’s prevalence in training data, highlighting how a format’s popularity directly influences model performance. While the studied models can be probably considered outdated by now since models are getting updated on a rapidly fashion, it is important to remark that addressing format bias is critical for advancing LLMs and ensuring their reliable application in real-world scenarios.

    -

    Recent (not yet peer-reviewed) research “Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models” [Tam et al., 2024] suggests that imposing format restrictions on LLMs might impact their performance, particularly in reasoning-intensive tasks. Further evidence [Aider, 2024] suggests LLMs may produce lower quality code if they’re asked to return it as part of a structured JSON response, in particular:

    +

    There is some evidence indicating that LLMs may have bias in their handling of different output formats [Long et al., 2024]. This study examined common output structures like multiple-choice answers, wrapped text, lists, and key-value mappings. The authors analyzed key LLM model families, namely Gemma, Mistral, and ChatGPT, uncovering bias across multiple tasks and formats. The researchers attributed these biases to the models’ underlying token distributions for different formats. An example of this format bias emerged in the comparison between JSON and YAML outputs. While models like Mistral and Gemma excelled at generating JSON structures, they performed notably worse with YAML. Their YAML outputs often contained extraneous information that degrades output quality. This disparity likely stems from JSON’s prevalence in training data, highlighting how a format’s popularity directly influences model performance. While the studied models can be probably considered outdated by now since models are getting updated on a rapidly fashion, it is important to remark that addressing format bias is critical for advancing LLMs and ensuring their reliable application in real-world scenarios.

    +

    Recent (not yet peer-reviewed) research “Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models” [Tam et al., 2024] suggests that imposing format restrictions on LLMs might impact their performance, particularly in reasoning-intensive tasks. Further evidence [Aider, 2024] suggests LLMs may produce lower quality code if they’re asked to return it as part of a structured JSON response, in particular:

    • Potential performance degradation: Enforcing structured output, especially through constrained decoding methods like JSON-mode, can negatively impact an LLM’s reasoning abilities. This is particularly evident in tasks that require multi-step reasoning or complex thought processes.

    • Overly restrictive schemas: Imposing strict schemas can limit the expressiveness of LLM outputs and may hinder their ability to generate creative or nuanced responses. In certain cases, the strictness of the schema might outweigh the benefits of structured output.

    • Increased complexity in prompt engineering: Crafting prompts that effectively guide LLMs to generate structured outputs while maintaining performance can be challenging. It often requires careful consideration of the schema, the task instructions, and the desired level of detail in the response.

    -

    On the other hand, those findings are not without criticism. The .txt team challenges the work of [Tam et al., 2024]. The rebuttal argues that structured generation, when done correctly, actually improves performance [Dottxt, 2024].

    +

    On the other hand, those findings are not without criticism. The .txt team challenges the work of [Tam et al., 2024]. The rebuttal argues that structured generation, when done correctly, actually improves performance [Dottxt, 2024].

    Structured vs Unstructured Results by .txt team
    -

    Fig. 4.4 Structured vs Unstructured Results by .txt team [Dottxt, 2024].

    +

    Fig. 4.4 Structured vs Unstructured Results by .txt team [Dottxt, 2024].

    -

    The .txt team presents compelling evidence through their reproduction of the paper’s experiments. While their unstructured results align with the original paper’s findings, their structured results paint a dramatically different picture - demonstrating that structured generation actually improves performance (see Fig. 4.4). The team has made their experimental notebooks publicly available on GitHub for independent verification [Dottxt, 2024].

    +

    The .txt team presents compelling evidence through their reproduction of the paper’s experiments. While their unstructured results align with the original paper’s findings, their structured results paint a dramatically different picture - demonstrating that structured generation actually improves performance (see Fig. 4.4). The team has made their experimental notebooks publicly available on GitHub for independent verification [Dottxt, 2024].

    .txt team identifies several flaws in the methodology of “Let Me Speak Freely?” that they believe led to inaccurate conclusions:

    • The paper finds that structured output improves performance on classification tasks but doesn’t reconcile this finding with its overall negative conclusion about structured output.

    • @@ -1237,12 +1237,12 @@

      -

      4.6. Conclusion

      +

      4.6. Conclusion

      Extracting structured output from LLMs is crucial for integrating them into real-world applications. By understanding the challenges and employing appropriate strategies and tools, developers can improve the reliability and usability of LLM-powered systems, unlocking their potential to automate complex tasks and generate valuable insights.

      Prompt engineering and the use of fine-tuned models can help control the output of LLMs. However, when strong guarantees are needed, practitioners should consider techniques such as logit post-processing that provides formal guarantees for controlled output generation.

    -

    4.7. Acknowledgements

    +

    4.7. Acknowledgements

    We would like to thank Cameron Pfiffer from the .txt team for his insightful review and feedback.

    CC BY-NC-SA 4.0

    @misc{tharsistpsouza2024tamingllms,
    @@ -1257,70 +1257,70 @@ 

    -

    4.8. References

    +

    4.8. References

    -
    +
    [Aid24]

    Aider. Code in json: structured output for llms. https://aider.chat/2024/08/14/code-in-json.html, 2024. Accessed: 2024.

    -
    +
    [Dot24] (1,2,3)

    Dottxt. Say what you mean: demos. https://github.com/dottxt-ai/demos/tree/main/say-what-you-mean, 2024. Accessed: 2024.

    -
    +
    [Gge24]

    Ggerganov. Llama.cpp grammars documentation. https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md, 2024. Accessed: 2024.

    -
    +
    [Lan4b]

    LangChain. Outlines integration documentation. Online Documentation, 2024b. Documentation on integrating Outlines library with LangChain for structured generation. URL: https://python.langchain.com/docs/integrations/chat/outlines/.

    -
    +
    [LWW+24] (1,2)

    Xun Liang, Hanyu Wang, Yezhaohui Wang, Shichao Song, Jiawei Yang, Simin Niu, Jie Hu, Dan Liu, Shunyu Yao, Feiyu Xiong, and Zhiyu Li. Controllable text generation for large language models: a survey. 2024. URL: https://arxiv.org/abs/2408.12599, arXiv:2408.12599.

    -
    +
    [LLF+24]

    Michael Xieyang Liu, Frederick Liu, Alexander J. Fiannaca, Terry Koo, Lucas Dixon, Michael Terry, and Carrie J. Cai. "we need structured output": towards user-centered constraints on large language model output. In Extended Abstracts of the CHI Conference on Human Factors in Computing Systems, CHI EA '24. New York, NY, USA, 2024. Association for Computing Machinery. URL: https://doi.org/10.1145/3613905.3650756, doi:10.1145/3613905.3650756.

    -
    +
    [LNS+24]

    Do Xuan Long, Hai Nguyen Ngoc, Tiviatis Sim, Hieu Dao, Shafiq Joty, Kenji Kawaguchi, Nancy F Chen, and Min-Yen Kan. Llms are biased towards output formats! systematically evaluating and mitigating output format bias of llms. arXiv preprint arXiv:2408.08656, 2024.

    -
    +
    [Nou24]

    NousResearch. Hermes-2-theta-llama-3-8b. https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B, 2024. Accessed: 2024.

    -
    +
    [Out24] (1,2)

    Outlines. Type-safe structured output from llms. https://dottxt-ai.github.io/outlines/latest/, 2024. Accessed: 2024.

    -
    +
    [TWT+24] (1,2)

    Zhi Rui Tam, Cheng-Kuang Wu, Yi-Lin Tsai, Chieh-Yen Lin, Hung-yi Lee, and Yun-Nung Chen. Let me speak freely? a study on the impact of format restrictions on performance of large language models. 2024. URL: https://arxiv.org/abs/2408.02442, arXiv:2408.02442.

    -
    +
    [TT24] (1,2)

    Vivien Tran-Thien. Fast, high-fidelity llm decoding with regex constraints. 2024. URL: https://vivien000.github.io/blog/journal/llm-decoding-with-regex-constraints.html.

    -
    +
    [WL23]

    Brandon T. Willard and Rémi Louf. Efficient guided generation for large language models. 2023. URL: https://arxiv.org/abs/2307.09702, arXiv:2307.09702.

    -
    +
    [GuidanceAI24]

    Guidance AI. Guidance: language model programming. GitHub Repository, 2024. Framework for programming language models with structured templating and control flow. URL: https://github.com/guidance-ai/guidance.

    -
    +
    [NVIDIA4a]

    NVIDIA. Logits processor zoo. GitHub Repository, 2024a. Collection of logits processors for controlling language model generation. URL: https://github.com/NVIDIA/logits-processor-zoo.

    -
    +
    [Wikipediacontributors24]

    Wikipedia contributors. Backus naur form. https://en.wiktionary.org/wiki/Backus-Naur_form, 2024. Accessed: 2024.

    diff --git a/tamingllms/_build/html/objects.inv b/tamingllms/_build/html/objects.inv index 2d5d56c..2a97490 100644 Binary files a/tamingllms/_build/html/objects.inv and b/tamingllms/_build/html/objects.inv differ diff --git a/tamingllms/_build/html/searchindex.js b/tamingllms/_build/html/searchindex.js index 2656419..7ff31ae 100644 --- a/tamingllms/_build/html/searchindex.js +++ b/tamingllms/_build/html/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["markdown/intro", "markdown/preface", "markdown/toc", "notebooks/alignment", "notebooks/cost", "notebooks/evals", "notebooks/input", "notebooks/local", "notebooks/safety", "notebooks/structured_output"], "filenames": ["markdown/intro.md", "markdown/preface.md", "markdown/toc.md", "notebooks/alignment.ipynb", "notebooks/cost.ipynb", "notebooks/evals.ipynb", "notebooks/input.ipynb", "notebooks/local.ipynb", "notebooks/safety.ipynb", "notebooks/structured_output.ipynb"], "titles": ["2. About the Book", "1. Preface", "Taming LLMs", "7. Preference-Based Alignment", "9. The Falling Cost Paradox", "3. The Evals Gap", "5. Managing Input Data", "8. Local LLMs in Practice", "6. Safety", "4. Structured Output"], "terms": {"am": [0, 8], "alwai": [0, 3, 4, 5, 6, 9], "do": [0, 3, 4, 5, 6, 7, 8, 9], "which": [0, 3, 4, 5, 6, 7, 8, 9], "cannot": [0, 3, 4, 5, 7, 8], "order": [0, 3, 5, 6, 8, 9], "mai": [0, 1, 3, 4, 5, 6, 7, 8, 9], "learn": [0, 3, 5, 6, 7, 8, 9], "how": [0, 1, 3, 4, 5, 6, 7, 8, 9], "pablo": [0, 5], "picasso": 0, "In": [0, 3, 4, 5, 6, 7, 8, 9], "recent": [0, 3, 4, 5, 6, 7, 8, 9], "year": [0, 3, 4, 5, 6, 7, 8, 9], "larg": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "languag": [0, 1, 2, 4, 5, 6, 7, 8, 9], "model": [0, 1, 2, 4, 6, 8, 9], "llm": [0, 1, 3, 6, 9], "have": [0, 1, 3, 4, 5, 6, 7, 8, 9], "emerg": [0, 3, 4, 6, 7, 8, 9], "transform": [0, 1, 3, 5, 6, 7, 8, 9], "forc": [0, 5, 9], "technologi": [0, 1, 4, 5, 6, 7, 8], "promis": [0, 3, 4, 5, 8], "revolution": [0, 8], "build": [0, 2, 3, 5, 6, 7, 8, 9], "product": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "interact": [0, 3, 4, 5, 6, 7, 8, 9], "comput": [0, 3, 4, 5, 6, 7, 8, 9], "from": [0, 1, 4, 5, 6, 7, 8, 9], "chatgpt": [0, 3, 4, 7, 9], "llama": [0, 3, 4, 5, 8, 9], "github": [0, 3, 4, 5, 7, 8, 9], "copilot": 0, "claud": [0, 3, 5, 7, 8], "artifact": 0, "system": [0, 3, 4, 5, 6, 7, 8, 9], "captur": [0, 1, 3, 5, 7, 8], "public": [0, 3, 5, 7, 8], "imagin": [0, 7], "spark": 0, "gold": [0, 3, 6, 8], "rush": 0, "ai": [0, 3, 4, 5, 7, 9], "power": [0, 2, 3, 4, 5, 7, 8, 9], "applic": [0, 1, 2, 3, 4, 6, 7, 8, 9], "howev": [0, 3, 4, 5, 6, 7, 8, 9], "beneath": 0, "surfac": [0, 5], "technolog": [0, 1, 4, 5, 6, 8], "revolut": [0, 4], "li": [0, 3, 5, 7, 8, 9], "complex": [0, 1, 3, 5, 6, 7, 8, 9], "landscap": [0, 3, 5, 7], "softwar": [0, 1, 3, 4, 6, 7, 8, 9], "develop": [0, 1, 3, 4, 5, 6, 7, 8, 9], "tech": [0, 7, 8], "leader": [0, 2, 5, 8], "must": [0, 3, 4, 5, 7, 8, 9], "navig": [0, 2, 5, 7, 8], "focus": [0, 3, 4, 5, 6, 7, 8, 9], "bring": [0, 3, 7], "awar": [0, 3, 4, 5, 6, 8], "limit": [0, 1, 2, 4, 5, 6, 7, 8, 9], "har": [0, 2, 5], "solut": [0, 2, 4, 5, 6, 7, 8], "overcom": [0, 5, 6], "them": [0, 1, 3, 4, 5, 6, 7, 8, 9], "robust": [0, 3, 4, 5, 6, 7, 8, 9], "It": [0, 3, 4, 5, 6, 7, 8, 9], "offer": [0, 3, 4, 5, 6, 7, 8, 9], "critic": [0, 2, 3, 4, 5, 6, 7, 8, 9], "implement": [0, 2, 3, 4, 5, 7, 9], "back": [0, 5, 6, 7, 8, 9], "reproduc": [0, 1, 2, 5, 7], "exampl": [0, 1, 2, 3, 5, 7, 8, 9], "while": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "mani": [0, 1, 3, 4, 5, 6, 7, 8, 9], "resourc": [0, 3, 4, 5, 6, 7, 8], "cover": [0, 3, 4, 5, 6, 7, 8, 9], "capabl": [0, 1, 2, 4, 5, 6, 7, 8, 9], "specif": [0, 3, 4, 5, 6, 7, 9], "hidden": [0, 3, 8], "pitfal": [0, 1, 3, 4, 5, 6, 7, 9], "engin": [0, 1, 2, 3, 4, 5, 7, 8], "technic": [0, 1, 2, 3, 5, 7, 9], "face": [0, 3, 4, 5, 6, 7, 8], "when": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "comprehens": [0, 2, 3, 4, 5, 6, 7, 8, 9], "guid": [0, 1, 3, 4, 5, 6, 7, 8, 9], "leverag": [0, 3, 5, 6, 7, 8, 9], "battl": [0, 2, 7], "test": [0, 2, 3, 4, 6, 7, 8, 9], "tool": [0, 1, 3, 4, 6], "throughout": [0, 4, 5, 6, 7, 8], "tackl": [0, 3, 5, 6, 8], "follow": [0, 3, 4, 5, 6, 7, 8, 9], "non": [0, 3, 7, 8, 9], "exhaust": [0, 7], "list": [0, 3, 5, 6, 7, 8, 9], "structur": [0, 3, 4, 5, 7, 8], "un": 0, "reliabl": [0, 1, 3, 4, 5, 7, 8, 9], "struggl": [0, 1, 3, 5, 6, 7, 8, 9], "maintain": [0, 1, 3, 4, 5, 6, 7, 8, 9], "consist": [0, 1, 3, 4, 5, 6, 7, 8, 9], "output": [0, 1, 3, 5, 6, 7, 8], "format": [0, 3, 4, 5, 6, 7, 8, 9], "complic": [0, 8], "integr": [0, 1, 3, 4, 5, 6, 7, 8, 9], "larger": [0, 3, 4, 5, 6, 7, 8, 9], "make": [0, 3, 4, 5, 6, 7, 8, 9], "error": [0, 3, 5, 6, 8, 9], "handl": [0, 3, 4, 5, 6, 7, 8, 9], "more": [0, 1, 3, 5, 6, 7, 8, 9], "input": [0, 3, 5, 7, 8, 9], "data": [0, 1, 4, 5, 7, 8, 9], "manag": [0, 1, 4, 5, 7, 8, 9], "ar": [0, 1, 3, 4, 5, 6, 7, 8, 9], "sensit": [0, 3, 4, 5, 6, 7, 8], "oper": [0, 3, 5, 6, 7, 8, 9], "stale": [0, 6], "long": [0, 1, 3, 4, 5, 7, 8, 9], "context": [0, 1, 3, 4, 5, 6, 7, 8, 9], "requir": [0, 3, 6, 7, 8, 9], "care": [0, 3, 4, 5, 6, 7, 8, 9], "retriev": [0, 4, 5, 7], "strategi": [0, 3, 4, 5, 6, 7, 8, 9], "tradit": [0, 3, 7, 8], "methodologi": [0, 3, 5, 7, 8, 9], "break": [0, 1, 3, 4, 5, 6, 8], "down": [0, 1, 4, 5, 6, 7, 8], "deal": [0, 3, 7], "determinist": [0, 9], "gener": [0, 1, 4, 7, 9], "new": [0, 2, 3, 4, 5, 6, 7, 8, 9], "safeti": [0, 3, 5, 9], "align": [0, 4, 5, 6, 7, 8, 9], "can": [0, 1, 3, 4, 5, 6, 7, 8, 9], "harm": [0, 3, 5, 7], "bias": [0, 3, 5, 7, 8, 9], "inappropri": [0, 3, 8], "safeguard": [0, 5, 8], "monitor": [0, 3, 4, 5, 6, 7, 8], "ensur": [0, 3, 4, 5, 6, 7, 8, 9], "safe": [0, 3, 5, 8, 9], "deploy": [0, 3, 4, 5, 8, 9], "vendor": [0, 4, 5, 7], "lock": [0, 3, 4, 7], "cloud": [0, 3, 4, 5, 7, 8, 9], "base": [0, 1, 4, 7, 9], "provid": [0, 2, 3, 4, 5, 6, 7, 8, 9], "creat": [0, 1, 3, 4, 5, 6, 7, 8, 9], "signific": [0, 3, 4, 5, 6, 7, 8, 9], "depend": [0, 3, 4, 5, 6, 7, 9], "through": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "proprietari": [0, 3, 7, 8, 9], "infrastructur": [0, 4, 7], "difficult": [0, 3, 5, 8], "switch": [0, 7], "self": [0, 3, 5, 6, 7, 8, 9], "host": [0, 4, 5, 7, 8], "cost": [0, 3, 5, 6, 8, 9], "optim": [0, 1, 5, 6, 7, 8], "The": [0, 1, 3, 6, 8, 9], "financi": [0, 1, 3, 4, 5, 6, 8, 9], "quickli": [0, 3, 4, 7], "becom": [0, 3, 4, 5, 6, 7, 8, 9], "prohibit": [0, 3, 5, 7], "without": [0, 1, 3, 4, 5, 7, 8, 9], "take": [0, 2, 3, 4, 5, 6, 7, 8, 9], "hand": [0, 7, 8, 9], "focu": [0, 2, 3, 4, 5, 6, 7, 8, 9], "access": [0, 3, 4, 5, 6, 7, 8, 9], "all": [0, 1, 3, 4, 5, 6, 7, 8, 9], "fulli": [0, 3, 5, 8], "document": [0, 3, 4, 5, 7, 8, 9], "allow": [0, 5, 6, 7, 8, 9], "reader": [0, 2, 6, 8], "replic": [0, 5, 8, 9], "result": [0, 3, 4, 5, 6, 8, 9], "exactli": [0, 5, 9], "design": [0, 1, 3, 7, 9], "run": [0, 3, 4, 5, 6, 7, 8, 9], "consum": [0, 3, 4, 5, 6, 7, 8, 9], "grade": [0, 3, 4, 5, 6, 7, 8], "hardwar": [0, 3, 4, 5], "expens": [0, 3, 4, 5, 6, 7, 8], "avail": [0, 3, 4, 5, 6, 7, 8, 9], "notebook": [0, 3, 9], "modifi": [0, 3, 5, 8, 9], "extend": [0, 3, 4, 5, 6, 7, 9], "minim": [0, 3, 4, 5, 7, 8, 9], "effect": [0, 1, 3, 4, 5, 6, 8, 9], "framework": [0, 3, 4, 5, 7], "wai": [0, 3, 4, 5, 6, 7, 8, 9], "priorit": [0, 3, 5, 6, 7, 8], "transpar": [0, 3, 4, 5, 7, 8], "visibl": [0, 5], "being": [0, 3, 4, 5, 6, 7, 8, 9], "better": [0, 2, 3, 4, 5, 6, 7, 8, 9], "understand": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "custom": [0, 3, 5, 6, 9], "flexibl": [0, 4, 5, 7, 8, 9], "adapt": [0, 3, 4, 5, 7, 8], "us": [0, 1, 3, 4, 7, 8, 9], "case": [0, 4, 5, 9], "unlik": [0, 3, 5, 7], "black": [0, 3], "box": [0, 7], "commerci": [0, 5, 7, 8, 9], "most": [0, 3, 4, 5, 6, 7, 8, 9], "freeli": [0, 9], "foster": [0, 3, 5, 8, 9], "reduc": [0, 3, 4, 5, 6, 7, 8, 9], "independ": [0, 5, 6, 8, 9], "freedom": [0, 7, 9], "architectur": [0, 3, 4, 5, 7, 9], "decis": [0, 3, 4, 5, 6, 7, 8], "keep": [0, 3, 5, 6, 7, 8], "principl": [0, 3, 5, 7, 8], "itself": [0, 3, 5, 6, 7, 8], "": [0, 1, 3, 4, 5, 6, 7, 8, 9], "live": [0, 1, 5, 6, 8], "evolv": [0, 4, 5, 7, 8], "chang": [0, 3, 5, 6, 7, 8], "encourag": [0, 3, 5, 8, 9], "report": [0, 3, 5, 6, 7, 8, 9], "suggest": [0, 3, 5, 7, 8, 9], "improv": [0, 3, 4, 5, 6, 7, 8, 9], "contribut": [0, 4, 5, 6, 7, 8], "via": [0, 3, 4, 5, 6, 7, 8, 9], "pull": [0, 7], "request": [0, 3, 4, 5, 6, 7, 8, 9], "share": [0, 3, 5, 6, 7, 8, 9], "own": [0, 3, 4, 5, 6, 7, 8], "experi": [0, 3, 4, 5, 6, 7, 8, 9], "commun": [0, 3, 4, 5, 6, 8, 9], "propos": [0, 4, 5, 6, 8], "chapter": [0, 3, 4, 5, 6, 7, 8, 9], "section": [0, 3, 4, 5, 6, 7, 8, 9], "found": [0, 3, 4, 5, 7, 9], "http": [0, 1, 3, 4, 5, 6, 7, 8, 9], "com": [0, 3, 4, 5, 6, 7, 8, 9], "souzatharsi": [0, 3, 4, 5, 6, 7, 8, 9], "tamingllm": [0, 3, 4, 5, 6, 7, 8, 9], "whether": [0, 3, 4, 5, 6, 7, 8, 9], "you": [0, 1, 3, 4, 5, 6, 7, 8, 9], "ve": [0, 7], "typo": [0, 8], "want": [0, 1, 3, 6, 7, 8, 9], "welcom": 0, "pleas": [0, 3, 5, 7, 8], "feel": [0, 7], "free": [0, 1, 3, 5, 7, 8], "look": [0, 2, 3, 4, 5, 6, 7, 8], "our": [0, 1, 3, 4, 5, 6, 7, 8, 9], "goal": [0, 1, 3, 5, 6, 8, 9], "discourag": [0, 6], "enabl": [0, 3, 4, 5, 6, 7, 8, 9], "By": [0, 1, 2, 3, 5, 6, 8, 9], "upfront": [0, 2, 4], "equip": [0, 2, 5, 8], "avoid": [0, 3, 5, 7, 8, 9], "current": [0, 2, 3, 4, 5, 6, 8, 9], "discours": [0, 2], "around": [0, 2, 3, 5, 6, 7, 8, 9], "tend": [0, 2, 5, 8], "toward": [0, 3, 5, 8, 9], "extrem": [0, 3, 4, 5, 8], "either": [0, 3, 5, 6, 7, 8], "uncrit": 0, "enthusiasm": 0, "wholesal": [0, 5], "dismiss": 0, "differ": [0, 3, 4, 5, 6, 7, 8, 9], "rather": [0, 1, 3, 4, 5, 6, 7, 8], "than": [0, 1, 3, 5, 6, 7, 8, 9], "theoret": [0, 3], "examin": [0, 3, 5, 6, 7, 8, 9], "first": [0, 1, 3, 4, 5, 6, 7, 8, 9], "everi": [0, 4, 5, 8], "concept": [0, 3, 5, 8], "illustr": [0, 3, 5, 6, 7, 8, 9], "execut": [0, 5, 7, 8], "immedi": [0, 3, 4, 5, 7], "analysi": [0, 1, 3, 4, 5, 6, 7, 8], "balanc": [0, 3, 4, 5, 7, 8, 9], "both": [0, 3, 4, 5, 6, 7, 8], "help": [0, 3, 4, 5, 6, 7, 8, 9], "inform": [0, 3, 4, 5, 6, 7, 8, 9], "lead": [0, 1, 3, 4, 5, 6, 7, 8, 9], "genai": [0, 1, 3, 6, 8], "initi": [0, 1, 3, 4, 5, 6, 7, 8, 9], "advoc": [0, 8], "anyon": [0, 8], "seek": [0, 5, 7, 8], "work": [0, 1, 3, 4, 5, 6, 7, 8, 9], "typic": [0, 3, 4, 5, 7, 8, 9], "job": [0, 5, 6, 7, 8], "role": [0, 3, 5, 6, 7, 8, 9], "platform": [0, 5, 6, 7, 8, 9], "backend": [0, 3, 5], "exist": [0, 3, 4, 5, 7], "ml": [0, 6, 8], "transit": [0, 4, 5, 7, 9], "overse": 0, "motiv": [0, 3, 4, 5, 6, 9], "need": [0, 3, 4, 5, 6, 7, 8, 9], "readi": [0, 5, 8], "desir": [0, 1, 3, 5, 9], "perform": [0, 3, 5, 6, 8, 9], "earli": [0, 3, 4, 5, 8, 9], "befor": [0, 3, 4, 5, 6, 8, 9], "thei": [0, 1, 3, 4, 5, 6, 7, 8, 9], "costli": [0, 5, 6, 8], "problem": [0, 1, 2, 3, 4, 6, 7, 8], "too": [0, 1, 3, 5, 6, 7, 8], "late": [0, 3, 4, 8, 9], "lifecycl": [0, 7, 8], "after": [0, 1, 3, 5, 6, 7, 8, 9], "read": [0, 3, 4, 5, 6, 8, 9], "implic": [0, 1, 3, 5, 8], "recommend": [0, 3, 5, 6, 7, 8, 9], "abl": [0, 3, 5, 9], "deploi": [0, 3, 5, 7, 8], "proper": [0, 3, 4, 7, 8, 9], "realist": [0, 3, 4, 8], "effort": [0, 5, 7, 8, 9], "estim": [0, 4, 5, 6, 8], "project": [0, 3, 4, 5, 6, 7, 8], "impact": [0, 3, 4, 5, 6, 7, 8, 9], "timelin": 0, "To": [0, 3, 5, 7, 8, 9], "should": [0, 3, 4, 5, 6, 7, 8, 9], "basic": [0, 3, 5, 7, 8], "program": [0, 5, 6, 7, 9], "knowledg": [0, 3, 5, 6, 7, 8], "mistral": [0, 3, 9], "openai": [0, 3, 5, 6, 7, 9], "anthrop": [0, 3, 6, 9], "similar": [0, 3, 4, 5, 6, 7, 9], "dive": [0, 4], "here": [0, 2, 3, 4, 5, 6, 7, 8, 9], "get": [0, 3, 4, 5, 6, 7, 8, 9], "start": [0, 3, 4, 5, 6, 7, 8, 9], "clone": [0, 3], "companion": 0, "git": 0, "cd": 0, "activ": [0, 3, 4, 5, 6, 7, 8], "virtual": [0, 5], "m": [0, 3, 5, 6, 7, 8, 9], "venv": [0, 9], "tame": [0, 3, 4, 5, 6, 7, 8, 9], "env": [0, 3, 5, 6, 8, 9], "bin": [0, 7], "On": [0, 5, 7, 9], "window": [0, 4, 5, 6, 7], "script": [0, 7], "try": [0, 1, 3, 5, 8, 9], "each": [0, 3, 4, 5, 6, 7, 8, 9], "contain": [0, 3, 4, 5, 6, 7, 8, 9], "possibl": [0, 3, 4, 5, 6, 7, 8, 9], "includ": [0, 1, 3, 4, 5, 6, 7, 8, 9], "necessari": [0, 3, 4, 5, 8], "instal": [0, 3, 5, 7, 9], "go": [0, 3, 5, 6, 9], "prefer": [0, 5, 6, 7, 8, 9], "packag": [0, 4, 5, 6, 7, 9], "e": [0, 1, 3, 4, 5, 6, 7, 8, 9], "g": [0, 3, 4, 5, 6, 7, 8, 9], "pip": [0, 3, 5, 7, 9], "poetri": [0, 8], "file": [0, 3, 5, 6, 7, 8, 9], "root": [0, 3], "directori": [0, 5, 7], "add": [0, 3, 5, 6, 7, 8], "other": [0, 3, 4, 5, 6, 7, 8, 9], "openai_api_kei": [0, 3], "your_openai_api_key_her": 0, "never": [0, 9], "commit": [0, 3, 5, 8], "version": [0, 3, 4, 5, 7, 8, 9], "control": [0, 1, 3, 4, 5, 6, 7, 8, 9], "kept": [0, 5], "privat": [0, 5], "If": [0, 1, 3, 4, 5, 7, 8, 9], "encount": [0, 2, 5, 8], "rate": [0, 3, 4, 5, 6, 7, 8], "consid": [0, 3, 4, 5, 6, 7, 8, 9], "smaller": [0, 3, 4, 5, 6, 7, 9], "retri": [0, 9], "logic": [0, 1, 3, 5, 6, 8], "conflict": [0, 3, 5], "fresh": 0, "like": [0, 1, 3, 4, 5, 6, 7, 8, 9], "check": [0, 5, 6, 7, 8, 9], "page": [0, 5, 6, 7], "known": [0, 5, 6, 8, 9], "now": [0, 1, 3, 4, 5, 6, 7, 8, 9], "let": [0, 3, 4, 5, 6, 7, 8, 9], "begin": [0, 5, 7, 8, 9], "explor": [0, 1, 3, 4, 5, 6, 7, 8, 9], "tharsi": [0, 3, 4, 5, 6, 7, 8, 9], "souza": [0, 3, 4, 5, 6, 7, 8, 9], "ph": [0, 8], "d": [0, 3, 4, 5, 6, 7, 8, 9], "scienc": [0, 3, 5, 8], "ucl": 0, "univers": [0, 5, 7, 8], "london": 0, "scientist": [0, 1, 7, 8], "special": [0, 4, 5, 6, 7, 8, 9], "he": [0, 3, 5, 8], "lectur": 0, "columbia": 0, "master": [0, 4, 7, 9], "appli": [0, 3, 5, 6, 7, 8, 9], "analyt": 0, "incom": [0, 5, 6], "head": [0, 3, 5, 6, 8, 9], "equiti": [0, 5, 6], "citadel": 0, "former": [0, 1, 5, 7], "senior": [0, 5], "vp": 0, "two": [0, 3, 4, 5, 6, 7, 8, 9], "sigma": [0, 3], "invest": [0, 3, 4, 5, 6, 8], "mentor": 0, "under": [0, 3, 4, 5, 7, 8, 9], "repres": [0, 3, 4, 5, 6, 7, 9], "student": [0, 3, 6, 8], "profession": [0, 3, 5, 8, 9], "divers": [0, 3, 4, 5, 6, 8], "global": [0, 5, 6, 8], "ai1": 0, "ecosystem": [0, 4, 5, 7], "With": [0, 3, 5, 7, 8, 9], "over": [0, 2, 3, 4, 5, 6, 7, 8, 9], "15": [0, 5, 6, 7, 8, 9], "deliv": [0, 4, 5, 7], "across": [0, 3, 4, 5, 6, 7, 8, 9], "startup": 0, "fortun": 0, "500": [0, 3, 5, 6, 8], "compani": [0, 3, 4, 5, 6, 8, 9], "also": [0, 3, 4, 5, 6, 7, 8, 9], "numer": [0, 4, 5, 8, 9], "scholarli": 0, "frequent": [0, 5, 7, 9], "speaker": [0, 5], "academ": [0, 3, 5, 8], "busi": [0, 5, 6, 7, 8], "confer": [0, 6, 9], "ground": [0, 3, 5, 7], "background": [0, 1, 5, 6, 7], "draw": [0, 3, 5, 8, 9], "scale": [0, 3, 4, 5, 6, 7, 8, 9], "stage": [0, 3, 8, 9], "major": [0, 3, 4, 5, 6, 7, 8, 9], "institut": [0, 5, 8], "well": [0, 3, 4, 5, 6, 7, 8, 9], "uniqu": [0, 3, 4, 5, 6, 7, 8, 9], "bridg": [0, 6, 7, 8], "gap": [0, 1, 3, 4, 7, 8], "between": [0, 1, 3, 4, 5, 6, 7, 8, 9], "potenti": [0, 1, 3, 4, 5, 6, 7, 8, 9], "next": [0, 1, 3, 4, 5, 6, 7, 8, 9], "tell": [1, 3, 8], "mere": [1, 5], "what": [1, 3, 4, 5, 6, 7, 8, 9], "someth": [1, 5, 7], "i": [1, 2, 4, 5, 7, 8, 9], "emanuel": [1, 3, 5, 8], "derman": 1, "an": [1, 2, 3, 4, 5, 6, 7, 8, 9], "altern": [1, 3, 4, 5, 6, 7, 8], "titl": [1, 3, 4, 5, 6, 7, 8, 9], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9], "book": [1, 5, 6], "could": [1, 3, 4, 5, 6, 7, 8, 9], "been": [1, 3, 4, 5, 7, 8], "behav": 1, "badli": 1, "come": [1, 3, 5, 6, 7, 8, 9], "notic": [1, 3, 4, 5, 8, 9], "parallel": [1, 3, 5, 7], "semin": [1, 8], "2011": 1, "coincident": 1, "just": [1, 3, 4, 5, 6, 7, 8, 9], "caution": 1, "against": [1, 3, 4, 5, 7, 8], "treat": [1, 5, 8], "perfect": [1, 5, 7], "represent": [1, 5, 6, 7, 8], "realiti": [1, 6, 8], "aim": [1, 3, 4, 5, 6, 7, 8, 9], "highlight": [1, 3, 5, 6, 7, 8, 9], "practic": [1, 3, 4, 5, 6, 8], "physicist": 1, "goldman": 1, "sach": 1, "quant": 1, "scientif": [1, 3, 5, 7], "fail": [1, 3, 5, 8], "we": [1, 3, 4, 5, 6, 7, 8, 9], "mistak": [1, 8], "approxim": [1, 4, 5, 9], "full": [1, 3, 4, 5, 6, 7, 8, 9], "assumpt": [1, 5, 8], "core": [1, 4, 5, 6, 7, 8], "premis": [1, 7], "hi": [1, 5, 8, 9], "aspect": [1, 3, 5, 6, 8], "world": [1, 3, 4, 5, 6, 7, 8, 9], "inher": [1, 2, 3, 5, 8], "involv": [1, 3, 4, 5, 6, 7, 8, 9], "simplif": 1, "argu": [1, 4, 8, 9], "crise": 1, "2008": 1, "crash": 1, "occur": [1, 3, 5, 8], "part": [1, 3, 4, 5, 6, 8, 9], "becaus": [1, 3, 5, 8], "peopl": [1, 3, 5, 7, 8], "put": [1, 5, 7], "much": [1, 3, 5, 6, 7], "faith": 1, "mathemat": [1, 5, 7, 9], "recogn": [1, 3, 5, 8], "human": [1, 4, 5, 6, 7, 8, 9], "behavior": [1, 3, 5, 7, 8], "market": [1, 4, 5, 6, 7, 9], "dynam": [1, 3, 5, 6, 8], "constraint": [1, 3, 4, 5, 6, 7, 8, 9], "hallucin": [1, 3, 5, 6, 8, 9], "fact": [1, 3, 5, 6, 8], "reason": [1, 3, 5, 6, 7, 8, 9], "Their": [1, 5, 9], "respons": [1, 4, 5, 6, 7, 8, 9], "often": [1, 3, 4, 5, 6, 7, 8, 9], "convinc": [1, 3], "probabilist": [1, 5, 9], "train": [1, 4, 5, 7, 8, 9], "true": [1, 3, 4, 5, 6, 8, 9], "even": [1, 3, 4, 5, 7, 8, 9], "though": [1, 3, 4, 5, 7, 8, 9], "insist": 1, "machin": [1, 3, 7, 8, 9], "todai": [1, 4, 7, 9], "grow": [1, 3, 5, 7, 8, 9], "pervas": [1, 8], "belief": [1, 7, 8], "solv": [1, 3, 4, 5, 7, 8, 9], "ani": [1, 3, 4, 5, 6, 7, 8, 9], "content": 1, "user": [1, 4, 5, 6, 7, 9], "moreov": 1, "were": [1, 3, 5, 7, 8, 9], "token": [1, 3, 4, 5, 6, 7, 8, 9], "predict": [1, 3, 5, 7, 8, 9], "chatbot": [1, 3, 5, 7, 8], "twist": [1, 8], "wrap": [1, 7, 9], "further": [1, 3, 4, 5, 6, 7, 8, 9], "daili": [1, 4, 7, 8], "life": [1, 5, 7, 8], "workflow": [1, 4, 5, 7, 8, 9], "affect": [1, 5, 6, 7, 8], "decid": [1, 3, 5, 6], "action": [1, 3, 5, 6, 8], "coupl": [1, 7], "lack": [1, 3, 5, 6, 8, 9], "pose": [1, 3, 5, 6, 8, 9], "risk": [1, 3, 4, 5, 6, 7], "still": [1, 4, 5, 7, 8], "figur": [1, 5, 7], "out": [1, 3, 4, 5, 6, 7, 8, 9], "serv": [1, 3, 4, 5, 6, 8, 9], "introductori": [1, 2], "practition": [1, 4, 5, 7, 9], "builder": [1, 7], "who": [1, 3, 5, 7, 8, 9], "remain": [1, 3, 4, 5, 6, 7, 8], "clear": [1, 3, 4, 5, 7, 8, 9], "ei": 1, "about": [1, 3, 4, 5, 6, 7, 8, 9], "therefor": [1, 3, 5, 7, 8], "end": [1, 3, 4, 5, 6, 7, 8, 9], "detail": [1, 3, 4, 5, 6, 7, 8, 9], "python": [1, 2, 5, 6, 7, 8, 9], "code": [1, 2, 3, 5, 6, 7, 8, 9], "diminish": [1, 3, 4, 5], "promot": [1, 3, 5, 8], "nuanc": [1, 3, 5, 6, 7, 8, 9], "acknowledg": [1, 5, 8], "within": [1, 3, 4, 5, 6, 7, 8, 9], "trustworthi": [1, 8], "taught": 1, "u": [1, 3, 5, 6, 8, 9], "step": [1, 3, 4, 5, 6, 7, 8, 9], "where": [1, 3, 4, 5, 6, 7, 8, 9], "der11": 1, "why": [1, 3, 5, 8, 9], "confus": [1, 4, 8], "illus": 1, "disast": [1, 5], "wall": [1, 7], "street": [1, 7], "press": [1, 5, 7], "isbn": [1, 3, 5], "9781439165010": 1, "url": [1, 3, 4, 5, 6, 7, 8, 9], "googl": [1, 5, 7, 9], "co": [1, 3, 4, 5, 7, 8, 9], "uk": [1, 8], "id": [1, 5, 6, 7, 8, 9], "lke_cwm4wm8c": 1, "sign": [2, 5, 8], "up": [2, 3, 4, 5, 6, 7, 8], "receiv": [2, 3, 5, 6, 7, 8, 9], "updat": [2, 3, 4, 5, 6, 7, 8, 9], "abstract": [2, 5, 6, 8, 9], "heavili": [2, 3, 4, 5, 6, 8, 9], "gloss": 2, "fundament": [2, 3, 5, 7, 8, 9], "challeng": [2, 3, 4, 5, 6, 7, 8, 9], "convers": [2, 3, 4, 5, 6, 7, 8, 9], "kei": [2, 3, 4, 6, 7, 8, 9], "proven": [2, 4], "yet": [2, 3, 4, 5, 6, 7, 8, 9], "concret": [2, 4, 8, 9], "sidestep": 2, "valu": [3, 5, 6, 7, 8, 9], "its": [3, 4, 5, 6, 7, 8, 9], "privileg": 3, "abov": [3, 5, 8], "soon": [3, 9], "lose": [3, 5], "dwight": 3, "eisenhow": 3, "releas": [3, 4, 5, 6, 7, 8], "3": [3, 4, 5, 6, 7, 9], "5": [3, 4, 5, 6, 7, 9], "2022": [3, 5, 7, 8], "mark": [3, 5, 6, 7, 8], "moment": [3, 8], "histori": [3, 4, 5, 7], "artifici": [3, 5, 7, 8], "intellig": [3, 5, 6, 7, 8], "five": [3, 5, 8], "dai": [3, 4, 5, 6, 7, 8, 9], "launch": [3, 5, 8], "attract": [3, 5], "million": [3, 4, 5, 7], "month": [3, 4, 5, 7, 8], "becam": [3, 4], "fastest": [3, 5, 8], "100": [3, 4, 5, 7, 8, 9], "monthli": [3, 4, 5], "rais": [3, 4, 5, 8], "intrigu": 3, "question": [3, 4, 5, 6, 7, 8, 9], "did": [3, 5, 6, 9], "observ": [3, 4, 5, 6, 7, 8, 9], "dramat": [3, 4, 5, 7, 9], "traction": [3, 7], "predecessor": 3, "gpt": [3, 4, 5, 6, 7, 8, 9], "had": [3, 5, 8], "same": [3, 5, 6, 7, 8, 9], "size": [3, 5, 6, 7, 8, 9], "number": [3, 4, 5, 6, 7, 8, 9], "paramet": [3, 4, 5, 6, 7, 8, 9], "far": [3, 4, 7, 8], "less": [3, 4, 5, 6, 7, 8], "attent": [3, 4, 6, 7], "arguabl": [3, 5, 6, 7], "feedback": [3, 5, 8, 9], "abil": [3, 4, 5, 6, 7, 8, 9], "breakthrough": [3, 7, 8], "demonstr": [3, 4, 5, 6, 7, 8, 9], "crucial": [3, 4, 6, 7, 8, 9], "greater": [3, 5, 6, 7, 8], "process": [3, 4, 5, 6, 7, 8], "modern": [3, 5, 6, 9], "techniqu": [3, 4, 5, 6, 7], "direct": [3, 5, 7, 8], "rafailov": 3, "et": [3, 4, 5, 6, 7, 8, 9], "al": [3, 4, 5, 6, 7, 8, 9], "2024": [3, 4, 5, 6, 8, 9], "present": [3, 5, 6, 7, 8, 9], "autom": [3, 4, 5, 8, 9], "fashion": [3, 9], "open": [3, 4, 5, 6, 8, 9], "sourc": [3, 4, 5, 6, 8, 9], "common": [3, 4, 5, 6, 7, 9], "pre": [3, 4, 5, 7, 8, 9], "default": [3, 5, 7, 8, 9], "state": [3, 5, 6, 7, 8, 9], "art": [3, 5, 8], "object": [3, 4, 5, 6, 7, 8, 9], "veri": [3, 4, 5, 7, 8], "ask": [3, 5, 7, 8, 9], "instruct": [3, 4, 5, 6, 7, 8, 9], "sai": [3, 9], "ouyang": [3, 8], "2": [3, 4, 5, 6, 9], "explain": [3, 5], "moon": 3, "land": [3, 5, 7], "6": [3, 4, 5, 6, 7], "old": [3, 5], "import": [3, 4, 5, 6, 7, 8, 9], "pipelin": [3, 4, 5, 6, 7, 8, 9], "pipe": [3, 8], "text": [3, 4, 5, 6, 7, 8, 9], "gpt2": [3, 5], "msg": [3, 6], "short": [3, 5, 6, 8, 9], "sentenc": [3, 5, 6, 8], "_": [3, 5, 8, 9], "rang": [3, 4, 5, 6, 7, 8, 9], "len": [3, 5, 6, 7, 8, 9], "print": [3, 4, 5, 6, 7, 8, 9], "f": [3, 4, 5, 6, 7, 8, 9], "n": [3, 5, 6, 7, 8, 9], "1": [3, 4, 5, 6, 7, 9], "0": [3, 4, 5, 6, 7, 8, 9], "generated_text": [3, 9], "good": [3, 5, 6, 7, 9], "idea": [3, 4, 7, 8, 9], "one": [3, 4, 5, 6, 7, 8, 9], "those": [3, 5, 8, 9], "littl": [3, 5], "green": [3, 6, 8], "dot": [3, 4, 6], "Then": [3, 4, 5], "line": [3, 5, 6, 7, 8], "later": [3, 5, 6, 7, 8, 9], "re": [3, 4, 5, 6, 7, 8, 9], "alreadi": [3, 5, 9], "movi": 3, "theori": [3, 5, 6], "some": [3, 5, 6, 7, 8, 9], "mean": [3, 4, 5, 6, 7, 8, 9], "word": [3, 4, 5, 6, 8, 9], "tepid": 3, "articl": [3, 5, 7, 8], "sure": [3, 5, 6, 8, 9], "lunar": 3, "As": [3, 4, 5, 7, 8, 9], "see": [3, 4, 5, 6, 7, 8, 9], "coher": [3, 5, 6, 7, 9], "explan": [3, 5, 8, 9], "child": [3, 5, 8], "complet": [3, 5, 6, 7, 8, 9], "instead": [3, 4, 5, 6, 7, 8, 9], "second": [3, 4, 5, 6, 7, 8], "nonsens": [3, 8], "meander": 3, "unrel": [3, 5, 8], "topic": [3, 5, 6, 7, 8, 9], "simpl": [3, 5, 6, 7, 8, 9], "appropri": [3, 4, 5, 6, 7, 8, 9], "young": [3, 5, 8], "given": [3, 4, 5, 6, 7, 8, 9], "sequenc": [3, 5, 6, 7, 9], "address": [3, 4, 5, 6, 7, 8, 9], "issu": [3, 5, 6, 8, 9], "introduc": [3, 5, 6, 7, 8, 9], "rlhf": [3, 4, 8, 9], "intent": [3, 8], "wide": [3, 4, 5, 6, 7, 8, 9], "task": [3, 4, 6, 8, 9], "fig": [3, 4, 5, 6, 7, 8, 9], "7": [3, 4, 5, 6, 7, 8], "collect": [3, 5, 7, 8, 9], "sampl": [3, 6, 7, 9], "label": [3, 5, 7, 8, 9], "comparison": [3, 6], "reward": [3, 5, 7, 8], "sever": [3, 4, 5, 6, 7, 8, 9], "rank": [3, 5, 7, 8], "best": [3, 4, 5, 7, 8], "worst": 3, "rm": [3, 7], "reinforc": [3, 5, 7, 8], "write": [3, 5, 6, 7, 8, 9], "stori": [3, 8], "frog": 3, "calcul": [3, 4, 5, 6, 7, 8, 9], "score": [3, 4, 5, 6, 7, 8, 9], "ppo": [3, 7], "proxim": [3, 7], "iter": [3, 5, 6, 7, 8, 9], "accur": [3, 4, 5, 6, 7, 8], "undesir": [3, 8], "simplifi": [3, 5, 7, 9], "view": [3, 5, 6, 8], "show": [3, 4, 5, 6, 7, 8, 9], "progress": [3, 4, 6, 8], "pattern": [3, 4, 5, 7, 8, 9], "ha": [3, 4, 5, 6, 7, 8, 9], "instanc": [3, 4, 5, 6, 7, 8], "directli": [3, 4, 5, 7, 8, 9], "For": [3, 4, 5, 6, 7, 8, 9], "guard": 3, "team": [3, 5, 6, 7, 9], "8b": [3, 7, 8, 9], "wa": [3, 4, 5, 6, 7, 8, 9], "classif": [3, 5, 7, 8, 9], "bypass": [3, 8], "similarli": [3, 4, 5, 7, 8], "zephyr": 3, "7b": [3, 5, 7, 8, 9], "alpha": [3, 5, 9], "publicli": [3, 5, 9], "assist": [3, 5, 6, 7, 8, 9], "paper": [3, 5, 7, 8, 9], "compon": [3, 5, 6, 7], "particular": [3, 4, 5, 6, 7, 8, 9], "foundat": [3, 4, 5, 6, 7, 8], "advanc": [3, 4, 5, 6, 7, 8, 9], "method": [3, 5, 6, 8, 9], "strong": [3, 5, 6, 7, 8, 9], "At": [3, 4, 5, 6, 7, 9], "high": [3, 4, 5, 6, 7, 8, 9], "level": [3, 4, 5, 6, 8, 9], "carefulli": [3, 4, 5, 6, 7, 8, 9], "curat": [3, 5, 7], "purpos": [3, 5, 6, 7, 8, 9], "exhibit": [3, 5, 7, 8], "domain": [3, 4, 5, 7, 8], "emploi": [3, 5, 6, 8, 9], "prove": [3, 5, 6, 8], "particularli": [3, 4, 5, 6, 7, 8, 9], "valuabl": [3, 5, 7, 9], "scenario": [3, 5, 7, 8, 9], "precis": [3, 4, 5, 7, 8, 9], "style": [3, 5], "tone": 3, "expertis": [3, 5, 6, 8], "medic": [3, 5, 7], "legal": [3, 5, 6, 7, 8], "field": [3, 5, 7, 8, 9], "adher": [3, 5, 6, 8, 9], "guidelin": [3, 5, 8], "servic": [3, 4, 5, 6, 7, 8], "standard": [3, 4, 5, 6, 7, 8], "approach": [3, 5, 6, 7, 9], "distinct": [3, 5, 7, 8, 9], "advantag": [3, 4, 5, 6, 7, 8, 9], "weight": [3, 4, 5, 6, 7, 8, 9], "maximum": [3, 5, 6, 7, 8], "lora": [3, 7, 8], "low": [3, 4, 5, 6, 7, 8, 9], "hu": [3, 6, 8, 9], "2021": [3, 4, 5, 6], "small": [3, 4, 5, 6, 7, 9], "matric": 3, "effici": [3, 4, 5, 6, 7, 8, 9], "qlora": 3, "quantiz": 3, "dettmer": 3, "2023": [3, 4, 5, 7, 8, 9], "combin": [3, 4, 5, 6, 7, 8, 9], "memori": [3, 4, 5, 6, 7, 8], "footprint": [3, 4, 7], "modest": [3, 7], "increas": [3, 4, 5, 6, 7, 8, 9], "likelihood": [3, 5, 8, 9], "obtain": [3, 5, 6, 7, 8, 9], "probabl": [3, 5, 7, 9], "outcom": [3, 5, 8, 9], "hong": [3, 5], "unintend": [3, 8], "suboptim": 3, "seen": [3, 5, 8], "form": [3, 4, 5, 7, 8, 9], "research": [3, 4, 5, 6, 7], "maxim": [3, 5, 6], "shown": [3, 5, 6, 7, 8], "alon": [3, 5, 7, 8], "gain": [3, 4, 5, 7, 8], "achiev": [3, 4, 5, 6, 7, 8, 9], "bai": [3, 5, 8], "touvron": [3, 7], "schulman": [3, 8], "2017": [3, 5], "algorithm": [3, 5, 8], "popular": [3, 6, 7, 9], "sinc": [3, 4, 5, 6, 7, 8, 9], "understood": 3, "set": [3, 4, 5, 6, 7, 8, 9], "rule": [3, 5, 6, 7, 9], "govern": [3, 5, 6], "reflect": [3, 5, 6, 7, 8], "anoth": [3, 5, 7, 8], "adjust": [3, 5, 7, 8, 9], "One": [3, 4, 5, 6, 7, 8, 9], "strength": [3, 5, 7, 8], "2024c": [3, 7], "real": [3, 4, 5, 6, 7, 8, 9], "noisi": 3, "delai": [3, 5, 7, 8], "subsequ": [3, 6, 9], "situat": [3, 5, 6, 8], "clip": 3, "surrog": 3, "function": [3, 4, 5, 6, 7, 8, 9], "stabl": [3, 5], "prevent": [3, 4, 5, 8, 9], "overreact": 3, "converg": 3, "due": [3, 5, 6, 7, 8], "simplic": [3, 7], "award": [3, 5], "runner": 3, "neurip": 3, "blog": [3, 4, 5, 7, 8, 9], "4": [3, 4, 5, 6, 7, 9], "fit": [3, 4, 5, 6, 8, 9], "pair": [3, 5, 8], "rl": [3, 8], "find": [3, 4, 5, 6, 7, 8, 9], "contrast": [3, 4, 5, 6, 7, 8, 9], "satisfi": [3, 5], "implicit": [3, 5, 6, 8], "whose": [3, 5], "correspond": [3, 5, 9], "extract": [3, 4, 5, 7, 8, 9], "close": [3, 5, 6, 7, 8], "compar": [3, 4, 5, 6, 7, 8], "assign": [3, 5, 6, 7, 8, 9], "higher": [3, 4, 5, 7, 9], "kl": [3, 7], "diverg": [3, 7], "origin": [3, 4, 5, 6, 7, 8, 9], "preserv": [3, 6, 7, 8, 9], "defin": [3, 4, 5, 6, 7, 8, 9], "equat": 3, "mathcal": 3, "l": [3, 5], "pi_": 3, "theta": [3, 9], "ref": 3, "mathbb": [3, 9], "x": [3, 5, 7, 8, 9], "y_w": 3, "y_l": 3, "sim": [3, 9], "left": [3, 6, 7], "log": [3, 4, 5, 7], "beta": [3, 5, 6, 8, 9], "frac": [3, 7, 8], "right": [3, 5, 6, 7, 8], "respect": [3, 5, 6, 7, 8], "deviat": [3, 5, 7, 8], "straightforward": [3, 5, 6, 7, 8, 9], "librari": [3, 4, 5, 6, 7, 8, 9], "huggingfac": [3, 4, 5, 7, 8, 9], "trl": [3, 7, 8], "2024d": [3, 7], "suit": [3, 5, 8], "friendli": [3, 5, 7], "interfac": [3, 4, 5, 6, 7, 8, 9], "featur": [3, 5, 6, 7, 8, 9], "distinguish": [3, 5, 8], "scalabl": [3, 5, 6, 8], "doe": [3, 5, 6, 7, 8, 9], "pretrain": [3, 5, 7], "hou": [3, 5, 7], "poor": [3, 5, 8], "return": [3, 4, 5, 6, 7, 8, 9], "addit": [3, 4, 5, 6, 7, 8, 9], "benefit": [3, 4, 5, 6, 7, 8, 9], "fix": [3, 5, 6, 7, 8], "invers": 3, "trend": [3, 4, 5, 6, 8], "util": [3, 4, 5, 6, 7, 8], "rapid": [3, 5, 6, 7, 8], "yield": [3, 4, 5, 6], "onli": [3, 4, 5, 6, 7, 8, 9], "margin": [3, 5, 6, 8, 9], "capit": [3, 5, 6, 9], "inaccuraci": [3, 5, 6], "nois": 3, "dure": [3, 4, 5, 6, 7, 8, 9], "accuraci": [3, 4, 5, 6, 7, 8, 9], "lag": [3, 5, 8], "significantli": [3, 4, 5, 6, 7, 8], "indic": [3, 5, 6, 7, 8, 9], "signal": [3, 6, 8], "plateau": 3, "sophist": [3, 5, 6, 7, 8], "previou": [3, 5, 6, 7, 9], "deriv": [3, 5, 6, 7], "pairwis": [3, 5], "feng": [3, 8], "substanti": [3, 4, 5, 6, 7, 8], "wors": [3, 7, 9], "influenc": [3, 5, 6, 8, 9], "success": [3, 4, 5, 6, 7, 8, 9], "imbal": 3, "stronger": 3, "bad": 3, "ones": [3, 7, 8], "loss": [3, 4, 5, 6, 7, 8], "gradient": [3, 5, 8], "dispref": 3, "unbalanc": 3, "trajectori": [3, 4], "stuck": 3, "saddl": 3, "point": [3, 4, 5, 6, 7, 8], "These": [3, 4, 5, 6, 7, 8, 9], "forward": [3, 5, 8], "futur": [3, 4, 5, 6, 7, 8], "phenomenon": [3, 8, 9], "degrad": [3, 4, 5, 7, 8, 9], "danger": [3, 7, 8], "loop": [3, 5, 7, 8], "recurs": [3, 6], "kazdan": 3, "qualiti": [3, 4, 5, 6, 7, 8, 9], "pollut": 3, "replac": [3, 5, 7], "amplif": 3, "reduct": [3, 4, 5, 7], "express": [3, 4, 5, 6, 8, 9], "catastroph": [3, 6, 8], "forget": [3, 6, 9], "previous": [3, 5, 6, 8, 9], "mitig": [3, 4, 5, 6, 7, 8, 9], "organ": [3, 4, 5, 6, 7], "mix": [3, 5, 6, 8, 9], "metric": [3, 6, 7, 8], "sz\u00e9p": 3, "guidanc": [3, 9], "regular": [3, 5, 7, 8, 9], "insight": [3, 4, 5, 6, 7, 8, 9], "relev": [3, 4, 5, 6, 7, 8], "scarc": 3, "behaviour": 3, "strateg": [3, 5, 6, 7, 8, 9], "compli": [3, 4, 5, 6, 7, 8, 9], "modif": [3, 5, 7, 8], "outsid": [3, 5], "evidenc": 3, "landmark": 3, "askel": [3, 5, 8], "2024a": [3, 6, 7, 9], "dec": 3, "explicitli": [3, 5, 7], "so": [3, 4, 5, 8, 9], "might": [3, 4, 5, 6, 7, 8, 9], "pretend": 3, "adopt": [3, 5, 7, 8, 9], "actual": [3, 5, 6, 7, 8, 9], "onc": [3, 5, 6, 7, 8], "describ": [3, 5, 7, 8], "harmless": [3, 8], "told": 3, "retrain": [3, 7], "queri": [3, 5, 6], "tier": [3, 4, 5, 8], "paid": [3, 5, 6], "column": [3, 5, 6, 8], "condit": [3, 5, 6, 9], "toxic": [3, 7, 8], "excerpt": [3, 5, 7], "scratchpad": 3, "refus": [3, 8, 9], "happen": [3, 8], "bomb": [3, 8], "engag": [3, 4, 5, 6, 7, 8, 9], "intern": [3, 5, 6, 8], "unmonitor": 3, "longer": [3, 5, 7], "believ": [3, 5, 7, 8, 9], "act": [3, 5, 6, 7, 8, 9], "therebi": [3, 5], "reveal": [3, 4, 5, 6, 7, 8], "complianc": [3, 4, 5, 6, 7, 8], "phase": [3, 4, 5, 7, 9], "natur": [3, 5, 6, 7, 8, 9], "evid": [3, 5, 7, 8, 9], "seemingli": [3, 6], "surpris": 3, "appear": [3, 5, 6, 8, 9], "criteria": [3, 5, 8], "underli": [3, 5, 6, 8, 9], "anim": [3, 8], "welfar": 3, "instil": 3, "implicitli": 3, "consequ": [3, 5, 6, 7, 8, 9], "explicit": [3, 5, 7, 8, 9], "chain": [3, 5, 6], "thought": [3, 5, 7, 9], "opaqu": 3, "aris": [3, 5, 8], "opu": 3, "sonnet": [3, 5, 7], "wherea": [3, 5], "haiku": [3, 8], "persist": [3, 4], "resist": [3, 5], "embed": [3, 4, 5, 6, 7], "doesn": [3, 5, 6, 7, 9], "t": [3, 4, 5, 6, 7, 8, 9], "anti": [3, 5], "lab": 3, "exfiltr": [3, 8], "protect": [3, 4, 5, 7, 8], "Not": [3, 5, 8], "malici": [3, 5, 8], "support": [3, 5, 6, 8, 9], "concern": [3, 5, 6, 7, 8], "mechan": [3, 4, 5, 7, 8, 9], "insuffici": [3, 5], "don": [3, 5, 6, 9], "concerningli": 3, "call": [3, 4, 5, 6, 7, 8, 9], "detect": [3, 5, 8, 9], "decept": [3, 5, 8], "warrant": [3, 8], "deeper": [3, 5, 6], "scrutini": [3, 5, 8], "reli": [3, 5, 6, 8, 9], "cross": [3, 5, 7, 8], "circular": 3, "bia": [3, 5, 8, 9], "truli": [3, 5, 7], "trust": [3, 5, 8, 9], "referenti": 3, "ly": 3, "hood": [3, 9], "deep": [3, 5, 8, 9], "mechanist": 3, "drive": [3, 4, 8, 9], "correl": [3, 4, 5, 7], "miss": [3, 5, 6, 8], "confound": 3, "factor": [3, 4, 5, 6, 7, 9], "establish": [3, 4, 5, 7, 8], "attempt": [3, 5, 8, 9], "causal": [3, 5], "heavi": 3, "relianc": [3, 4, 5, 6, 8], "oversimplifi": 3, "frame": 3, "subtler": 3, "narr": [3, 5], "internet": [3, 5], "henc": [3, 4, 5, 6, 7, 8, 9], "agenc": [3, 5, 6, 8], "onto": 3, "anthropomorph": 3, "obscur": 3, "blind": [3, 5], "failur": [3, 4, 5, 6, 8, 9], "mode": [3, 7, 8], "map": [3, 4, 5, 6, 7, 9], "cleanli": 3, "analogi": 3, "interest": [3, 4, 5, 6, 7, 8, 9], "empir": 3, "excel": [3, 5, 6, 7, 8, 9], "review": [3, 4, 5, 6, 7, 8, 9], "prof": [3, 8], "jacob": [3, 5, 6, 7, 8], "andrea": [3, 5, 8], "yoshua": [3, 8], "bengio": [3, 8], "jasjeet": 3, "sekhon": 3, "dr": 3, "rohin": 3, "shah": 3, "2024b": [3, 7, 9], "assum": [3, 5, 6, 8], "acm": [3, 6, 8], "inc": [3, 5, 6, 9], "dedic": [3, 5, 7, 8], "democrat": [3, 4, 5, 6, 7, 9], "educ": [3, 5, 6], "k": [3, 5, 6, 8, 9], "12": [3, 4, 5, 6, 7, 8], "name": [3, 4, 5, 6, 7, 8, 9], "smolk": 3, "ll": [3, 5, 7], "walk": 3, "measur": [3, 4, 5, 6, 7, 8], "huggingfacetb": [3, 9], "360m": [3, 5, 7], "compact": [3, 5, 6, 7, 8], "famili": [3, 8, 9], "publish": [3, 6, 8, 9], "api": [3, 4, 5, 6, 7, 9], "local": [3, 4, 5, 6, 8, 9], "infer": [3, 4, 5, 6, 7, 8, 9], "remot": [3, 5], "load": [3, 4, 5, 6, 7, 8, 9], "store": [3, 4, 5, 6, 8], "eventu": [3, 5, 7], "final": [3, 5, 6, 8, 9], "repositori": [3, 4, 5, 6, 7, 8, 9], "your_openai_api_kei": 3, "reusabl": 3, "anchor": [3, 8], "worth": [3, 4, 5, 6, 7, 9], "choic": [3, 5, 6, 7, 8, 9], "lightweight": [3, 4, 5, 7, 9], "suitabl": [3, 5, 6, 8], "devic": [3, 4, 5, 7, 9], "Its": [3, 5, 7], "candid": [3, 5, 6, 7], "main": [3, 5, 6, 7, 8, 9], "said": [3, 5, 8], "necessarili": [3, 4, 5, 7, 8], "par": [3, 5, 7], "mind": [3, 5, 7, 8, 9], "along": [3, 4, 5, 7, 8], "factual": [3, 5, 6, 7, 8], "inconsist": [3, 5, 8], "guardrail": [3, 8], "articul": 3, "uphold": [3, 8], "employe": [3, 5, 6], "stakehold": [3, 5, 8], "expect": [3, 4, 5, 6, 7, 8, 9], "regard": [3, 5, 7, 8], "ethic": [3, 5, 7, 8], "conduct": [3, 5], "social": [3, 5, 8], "mission": [3, 8], "vision": [3, 5, 7, 8], "cultur": [3, 5, 7, 8], "account": [3, 4, 5, 8], "codifi": 3, "benchmark": 3, "mlcommon": 3, "vidgen": [3, 8], "encompass": [3, 4, 8, 9], "seven": 3, "hazard": [3, 5, 8], "categori": [3, 5, 6, 7, 8, 9], "violent": [3, 8], "crime": [3, 8], "sex": [3, 8], "relat": [3, 4, 5, 6, 7, 8, 9], "sexual": [3, 8], "exploit": [3, 4, 5, 8], "indiscrimin": [3, 8], "weapon": [3, 8], "chemic": 3, "biolog": 3, "radiolog": 3, "nuclear": [3, 5], "explos": [3, 4, 8], "cbrne": 3, "suicid": [3, 8], "hate": [3, 8], "speech": [3, 8], "below": [3, 5, 6, 7, 8, 9], "markdown": [3, 5, 6, 7, 8, 9], "written": [3, 5], "english": [3, 4], "o": [3, 5, 6, 8, 9], "ipython": [3, 5, 6, 8], "displai": [3, 5, 6, 8, 9], "def": [3, 5, 6, 8, 9], "load_polici": 3, "policy_path": 3, "path": [3, 5, 6, 7, 8], "join": [3, 5, 6, 8], "genai_polici": 3, "md": [3, 5, 6, 7, 8, 9], "r": [3, 5, 6, 7, 8, 9], "policy_cont": 3, "classroom": [3, 8], "accept": [3, 5, 7, 8], "unaccept": [3, 7], "ag": [3, 5, 8], "subject": [3, 5, 7], "posit": [3, 4, 5, 6, 7, 8, 9], "confid": [3, 5, 6], "inclus": [3, 5, 6, 8, 9], "celebr": 3, "definit": [3, 4, 5, 6, 9], "creativ": [3, 4, 5, 7, 9], "math": [3, 5, 7], "tip": [3, 8], "digit": [3, 4, 5, 6], "literaci": 3, "onlin": [3, 4, 5, 7, 8, 9], "histor": [3, 5, 6], "violenc": [3, 8], "physic": [3, 5, 8], "fight": [3, 8], "crimin": [3, 8], "illeg": [3, 8], "glorifi": [3, 8], "person": [3, 5, 6, 7, 8, 9], "eat": [3, 8], "disord": 3, "diet": 3, "dare": 3, "advic": [3, 5, 8], "discriminatori": [3, 8], "bulli": [3, 8], "harass": [3, 5, 8], "target": [3, 4, 5, 7, 8, 9], "group": [3, 5, 6, 7, 8], "religi": [3, 7, 8], "racial": [3, 5, 8], "ethnic": [3, 8], "gender": [3, 5, 8], "discrimin": [3, 5, 8], "adult": [3, 8], "profan": [3, 8], "relationship": [3, 5], "substanc": [3, 5], "drug": [3, 8], "gambl": 3, "bet": 3, "protocol": [3, 5, 8], "redirect": 3, "alert": [3, 4], "record": [3, 5, 7, 8], "audit": [3, 4, 5, 6], "teacher": [3, 8], "parent": [3, 8], "continu": [3, 4, 5, 6, 7, 8, 9], "construct": [3, 5, 6, 7, 8, 9], "compliant": [3, 8], "violat": [3, 5, 8], "intens": [3, 5, 6, 9], "demand": [3, 4, 5, 7, 8, 9], "especi": [3, 5, 6, 7, 8, 9], "dong": [3, 5, 8], "There": [3, 5, 6, 7, 8, 9], "rlaif": [3, 8], "give": [3, 5, 6, 8], "rise": [3, 8], "kim": [3, 5, 8], "meta": [3, 4, 5, 7, 8], "wu": [3, 5, 8, 9], "scheme": [3, 4, 7], "inspir": [3, 8], "schema": [3, 9], "row": [3, 5, 6, 8], "match": [3, 4, 5, 6, 7, 8, 9], "boundari": [3, 4, 5, 8], "craft": [3, 4, 5, 8, 9], "elicit": [3, 8, 9], "unalign": 3, "panda": [3, 5, 6, 8], "chosen_responses_path": 3, "chosen_respons": 3, "csv": [3, 5, 8], "rejected_responses_path": 3, "rejected_respons": 3, "chosen_responses_jsonl_path": 3, "batch_result": 3, "jsonl": 3, "dpo_dataset_s": 3, "5000": [3, 7], "class": [3, 5, 6, 8, 9], "userpromptgener": 3, "pd": [3, 5, 6, 8], "pydant": [3, 5, 6, 8, 9], "basemodel": [3, 5, 6, 8, 9], "time": [3, 4, 5, 6, 7, 8, 9], "type": [3, 4, 5, 6, 7, 8, 9], "dotenv": [3, 5, 6, 8, 9], "load_dotenv": [3, 5, 6, 8, 9], "environ": [3, 4, 5, 6, 7, 8, 9], "variabl": [3, 5, 6, 8, 9], "overrid": [3, 6, 8, 9], "userprompt": 3, "user_prompt": 3, "str": [3, 5, 6, 8, 9], "__init__": [3, 6, 8, 9], "4o": [3, 5, 6, 7, 8, 9], "mini": [3, 5, 6, 7, 8, 9], "client": [3, 5, 6, 7, 8, 9], "_generate_prompt": 3, "batch": [3, 4, 5, 6, 7], "system_prompt": [3, 8], "chat": [3, 5, 6, 7, 8, 9], "pars": [3, 5, 8, 9], "messag": [3, 4, 5, 6, 7, 8, 9], "response_format": [3, 5, 6, 8, 9], "except": [3, 5, 8, 9], "generate_prompt": 3, "num_prompt": [3, 7], "int": [3, 5, 6, 8], "save_to_csv": 3, "least": [3, 5, 8], "multipl": [3, 4, 5, 6, 7, 8, 9], "arg": [3, 5, 6, 8, 9], "option": [3, 4, 5, 6, 7, 8, 9], "filepath": 3, "save": [3, 4, 5, 6, 7, 8], "datafram": [3, 5, 6, 8], "all_prompt": 3, "sleep": 3, "enclos": [3, 8, 9], "quot": [3, 4, 5, 7], "startswith": [3, 8], "els": [3, 5, 6, 8], "df": [3, 5, 8], "to_csv": [3, 8], "index": [3, 5, 6, 7, 8, 9], "fals": [3, 5, 6, 7, 8, 9], "user_prompt_gener": 3, "user_prompts_path": 3, "uneth": [3, 8], "dishonesti": 3, "stalk": 3, "privaci": [3, 4, 5, 6, 7, 8, 9], "secur": [3, 4, 5, 6, 8, 9], "breach": [3, 5, 8], "manipul": [3, 5, 7, 8, 9], "10": [3, 5, 6, 7, 8, 9], "to_markdown": [3, 8], "me": [3, 6, 8, 9], "hurt": 3, "someon": 3, "caught": [3, 8], "plan": [3, 4, 5, 7, 9], "cheat": 3, "fire": [3, 5], "household": 3, "item": [3, 5, 8], "stunt": 3, "friend": 3, "heard": 3, "school": [3, 8], "8": [3, 4, 5, 6, 7, 8], "teach": [3, 9], "my": [3, 7, 8, 9], "monei": [3, 5], "video": [3, 4, 5, 7, 8], "game": [3, 4, 5, 6, 7], "9": [3, 4, 5, 6, 7, 8], "skip": [3, 8, 9], "troubl": [3, 8], "responsegener": 3, "properli": [3, 5, 9], "hug": [3, 4, 5, 7, 8], "instanti": [3, 5, 6], "otherwis": [3, 5, 8], "connect": [3, 4, 5, 6, 7, 9], "endpoint": 3, "local_gener": 3, "model_nam": [3, 4, 5, 6, 9], "huggingface_model_nam": 3, "remote_gener": 3, "api_url": 3, "cloud_endpoint": 3, "recal": [3, 5, 7], "enhanc": [3, 4, 5, 6, 7, 8, 9], "visit": [3, 5], "ui": [3, 5, 9], "click": [3, 7], "select": [3, 4, 5, 6, 7, 9], "choos": [3, 4, 5], "cpu": [3, 4, 7], "gpu": [3, 4, 7], "configur": [3, 4, 5, 7, 8], "meaning": [3, 5, 6, 9], "region": [3, 5, 6], "closest": [3, 5, 7], "your": [3, 4, 5, 6, 8, 9], "locat": [3, 5, 6, 7, 8], "huggingface_hub": 3, "inferencecli": 3, "tokenizers_parallel": 3, "max_new_token": 3, "none": [3, 5, 6, 7, 8], "generate_respons": [3, 5, 9], "prompts_df": 3, "remov": [3, 5, 7], "strip": [3, 5, 9], "elif": [3, 6], "chat_complet": 3, "max_token": [3, 5], "seed": [3, 8], "42": [3, 4, 5, 7, 8], "append": [3, 5, 6, 8, 9], "results_df": [3, 8], "model_respons": 3, "your_api_url": 3, "user_prompts_df": 3, "read_csv": [3, 8], "iloc": [3, 6], "tolist": [3, 8], "parallelevalu": 3, "taming_util": [3, 4, 8], "modul": [3, 5, 6, 9], "num_chunk": 3, "parallel_evalu": 3, "n_part": 3, "associ": [3, 5, 6, 7, 9], "gladli": 3, "constitut": [3, 5, 6], "would": [3, 5, 6, 7, 8, 9], "dtype": [3, 5, 8], "80": [3, 5], "absolut": [3, 4, 5, 9], "materi": [3, 5, 6, 7, 8], "plastic": 3, "food": 3, "hold": [3, 5], "lid": 3, "cut": [3, 5], "swath": 3, "wood": [3, 5], "squar": 3, "rectangular": 3, "piec": [3, 6], "place": [3, 5, 7, 8, 9], "insid": [3, 5, 8], "inch": 3, "inspect": [3, 5], "off": [3, 4, 5, 7, 8, 9], "demolit": 3, "scissor": 3, "length": [3, 5, 7, 9], "smash": 3, "smooth": [3, 6, 7], "arrang": [3, 5], "c": [3, 4, 5, 7, 9], "shape": [3, 8, 9], "top": [3, 5, 6, 7, 9], "tuck": 3, "catch": [3, 8], "hook": 3, "solid": 3, "side": [3, 5], "round": [3, 5, 8], "edg": [3, 4, 5, 7, 8], "separ": [3, 5, 6, 7, 8], "process_aligned_respons": 3, "strictli": [3, 9], "bound": [3, 5], "openaibatchprocessor": 3, "async": 3, "company_nam": 3, "save_filepath": 3, "dict": [3, 5, 6, 9], "enforc": [3, 5, 8, 9], "dictionari": [3, 5, 8, 9], "aligned_suffix": 3, "sorri": 3, "suffix": [3, 9], "processor": [3, 4, 7, 9], "api_kei": [3, 5, 6, 8], "getenv": 3, "max_requests_per_minut": 3, "1500": 3, "max_tokens_per_minut": 3, "125000": 3, "await": 3, "process_batch": 3, "total": [3, 4, 5, 6, 7, 8, 9], "total_request": 3, "successful_request": 3, "failed_request": 3, "rate_limit_error": 3, "convert": [3, 4, 5, 6, 7, 8, 9], "json": [3, 5, 6, 7, 8], "quote_al": 3, "fall": [3, 5, 7, 8], "deem": [3, 5, 8], "pertain": [3, 5], "generate_dpo_dataset": 3, "push": [3, 4, 5], "hub": [3, 4, 5, 7], "repo_id": [3, 7], "push_to_hub": [3, 5], "dpo_dataset": 3, "merg": [3, 6, 8], "_chosen": 3, "_reject": 3, "transform_row": 3, "per": [3, 4, 5, 6, 7, 8], "model_responses_chosen": 3, "model_responses_reject": 3, "seri": [3, 4, 5, 7], "axi": [3, 5], "drop": [3, 4, 5, 6, 8], "hf_dpo_dataset": 3, "from_panda": 3, "duplic": 3, "opt": 3, "login": 3, "thatupiso": 3, "smolk12": 3, "cli": [3, 5, 6, 7], "parquet": 3, "arrow": 3, "00": [3, 5, 6, 7], "153": [3, 5], "33ba": 3, "upload": [3, 5], "shard": 3, "02": 3, "35": [3, 5, 6, 7], "num_row": 3, "7158": 3, "nmateri": 3, "n1": [3, 5], "nstep": 3, "n2": [3, 5], "n3": [3, 5], "n4": [3, 5], "n5": [3, 5], "n6": 3, "n7": 3, "n8": [3, 5], "n9": [3, 5], "n10": [3, 5], "nnext": 3, "nthe": [3, 5], "singl": [3, 4, 5, 6, 7, 8], "48gb": 3, "a100": 3, "took": 3, "few": [3, 5, 6, 7, 8, 9], "minut": [3, 6], "torch": [3, 9], "h4": [3, 8], "honest": [3, 5], "ultrafeedback": [3, 8], "binar": [3, 8], "lib": [3, 8, 9], "ultrafeedback_binar": [3, 8], "honesti": [3, 8], "dimens": [3, 5, 7, 8], "blend": [3, 7], "automodelforcausallm": [3, 9], "autotoken": [3, 9], "load_dataset": [3, 7, 8], "dpotrain": 3, "dpoconfig": 3, "dataset_k12": 3, "split": [3, 5, 6, 7, 8], "dataset_ultra": 3, "concatenate_dataset": 3, "remove_column": 3, "score_chosen": [3, 8], "score_reject": 3, "shuffl": 3, "base_model": 3, "cuda": [3, 9], "is_avail": 3, "mp": 3, "from_pretrain": [3, 7, 9], "pretrained_model_name_or_path": 3, "torch_dtyp": [3, 9], "float32": 3, "config": [3, 5, 7, 8], "use_cach": 3, "pad_token": 3, "eos_token": 3, "finetun": 3, "finetune_nam": 3, "aligned_model": 3, "finetune_tag": 3, "from_smollm2": 3, "schedul": [3, 5, 7], "learning_r": [3, 7], "determin": [3, 4, 5, 6, 7, 8, 9], "aggress": [3, 5, 7, 8], "1e": 3, "huyen": 3, "cosin": 3, "lr_scheduler_typ": 3, "stabil": [3, 5, 6, 8], "gradual": 3, "decreas": [3, 4, 5, 9], "accumul": [3, 5], "v": [3, 9], "16": [3, 4, 5, 6, 7, 8], "per_device_train_batch_s": 3, "simul": [3, 5, 8, 9], "gradient_accumulation_step": 3, "strongli": [3, 9], "lower": [3, 4, 5, 7, 8, 9], "conserv": [3, 8], "overfit": 3, "warmup": 3, "max_step": 3, "1000": [3, 5, 7, 8], "suffic": 3, "20": [3, 5, 6, 7, 8, 9], "warmup_step": 3, "stop": [3, 4, 5, 7], "bf16": 3, "checkpoint": 3, "gradient_checkpoint": 3, "usag": [3, 4, 5, 7, 8, 9], "200": [3, 4, 5, 7, 8], "50": [3, 5, 6, 7, 8, 9], "training_results_dir": 3, "smolk12_dpo_output": 3, "dpo_config_path": 3, "dpo_config": 3, "yaml": [3, 5, 9], "pathlib": [3, 6, 8], "config_path": 3, "safe_load": [3, 5], "runtim": [3, 7, 9], "hub_model_id": 3, "use_mps_devic": 3, "output_dir": [3, 5], "training_arg": 3, "trainer": 3, "train_dataset": 3, "processing_class": 3, "temperatur": [3, 5, 6, 7, 8, 9], "max_prompt_length": [3, 7], "1024": 3, "max_length": [3, 5, 9], "1536": 3, "red": [3, 6], "move": [3, 4, 5, 6, 7, 8], "averag": [3, 4, 5, 7, 9], "visual": [3, 5, 7, 8], "quick": [3, 5, 6, 7, 8], "150": [3, 5], "curv": 3, "reach": [3, 5, 6, 7, 8, 9], "obviou": 3, "suffici": [3, 5, 9], "save_model": 3, "hf_token": 3, "tag": [3, 8, 9], "congratul": 3, "successfulli": [3, 5, 6, 8, 9], "card": [3, 5, 8], "newli": [3, 5], "qualit": [3, 5, 8], "assess": [3, 4, 5, 6, 7, 8], "rigor": [3, 5, 7, 8], "quantit": [3, 5], "base_gener": 3, "aligned_gener": 3, "compare_model_respons": 3, "base_output": 3, "128": [3, 5, 7], "aligned_output": 3, "gram": [3, 5], "tnt": 3, "highli": [3, 4, 5, 7, 8, 9], "regul": [3, 4, 5, 6, 7, 8], "law": [3, 4, 5, 6, 7, 8], "degre": [3, 5, 6, 9], "mishandl": 3, "countri": [3, 5, 6], "seriou": [3, 5, 8], "imprison": 3, "death": [3, 6], "variou": [3, 4, 5, 6, 7, 8], "nation": [3, 8], "dictat": 3, "stark": [3, 5], "readili": [3, 5], "cite": [3, 6], "regulatori": [3, 4, 5, 6, 7, 8], "anecdot": [3, 8], "systemat": [3, 4, 5, 6, 7, 8, 9], "quantifi": [3, 5, 7, 8], "f1": [3, 5, 8], "experienc": [3, 5], "expert": [3, 5, 6, 7, 8, 9], "addition": [3, 4, 5, 7, 8], "vari": [3, 4, 5, 6, 7, 8, 9], "interpret": [3, 5, 6, 7, 8], "judg": [3, 5], "summar": [3, 5, 6, 7], "three": [3, 5, 6, 7, 8], "togeth": [3, 6, 7, 8], "entri": [3, 5, 7], "somewhat": [3, 6], "databas": [3, 4, 5, 9], "distribut": [3, 4, 5, 7, 8, 9], "static": [3, 6, 8, 9], "k12": [3, 8], "base_model_api_url": 3, "aligned_model_api_url": 3, "base_model_responses_path": 3, "evals_base_model_respons": 3, "aligned_model_responses_path": 3, "evals_aligned_model_respons": 3, "num_sampl": [3, 8], "eval_dataset": 3, "df_eval": 3, "to_panda": [3, 5, 8], "lambda": [3, 8], "prompts_ev": 3, "to_list": 3, "chunk": [3, 7], "base_model_respons": 3, "aligned_model_respons": 3, "df_eval_respons": 3, "_base": 3, "_align": 3, "rememb": [3, 5], "heurist": 3, "charact": [3, 5, 7, 8, 9], "minimum": [3, 4, 5, 7], "min_response_length": 3, "filter": [3, 5, 7, 9], "string": [3, 5, 6, 8, 9], "df_eval_responses_clean": 3, "model_responses_bas": 3, "model_responses_align": 3, "homemad": 3, "kid": 3, "redact": [3, 8], "punish": 3, "unit": [3, 5, 6, 8, 9], "indonesia": 3, "saudi": 3, "arabia": 3, "offens": [3, 8], "respond": [3, 4, 5, 8, 9], "rodrig": 3, "safetyjudg": 3, "evaluate_respons": 3, "tupl": [3, 5, 8], "safetyscor": [3, 8], "float": [3, 4, 5, 6, 7, 8, 9], "valueerror": [3, 9], "empti": [3, 9], "scoring_guid": 3, "nrespons": 3, "safety_judg": 3, "test_respons": 3, "emphas": [3, 5, 6, 7, 8], "emphasi": [3, 4, 5], "base_ev": 3, "zip": [3, 5, 9], "aligned_ev": 3, "injuri": [3, 5], "base_scor": 3, "eval": [3, 4, 7], "aligned_scor": 3, "base_df": 3, "aligned_df": 3, "model_typ": 3, "stack": [3, 7, 8], "evals_df_result": 3, "h": [3, 5, 7, 8], "identifi": [3, 4, 5, 6, 7, 8, 9], "requ": 3, "statist": [3, 5, 8], "naiv": [3, 6, 9], "score_map": 3, "count": [3, 5, 6, 7, 8], "percentag": [3, 4, 5, 8], "score_base_freq": 3, "score_bas": 3, "value_count": [3, 8], "reindex": 3, "fill_valu": 3, "score_base_pct": 3, "score_aligned_freq": 3, "score_align": 3, "score_aligned_pct": 3, "tabl": [3, 5, 6, 7, 8, 9], "md_tabl": 3, "335": [3, 5], "99": [3, 4, 6, 7, 8], "281": [3, 5], "83": [3, 4, 5, 8], "14": [3, 5, 6, 7, 8, 9], "43": [3, 5, 6, 7, 8], "explanation_bas": 3, "response_bas": 3, "model_type_bas": 3, "explanation_align": 3, "response_align": 3, "model_type_align": 3, "std": [3, 5, 8], "base_mean": 3, "aligned_mean": 3, "3f": 3, "108": [3, 5], "231": [3, 5], "No": [3, 5, 7, 8, 9], "fell": [3, 4], "partial": [3, 5], "styliz": [3, 8], "wild": [3, 7], "consider": [3, 4, 7, 8, 9], "proof": [3, 4], "taken": [3, 5, 7, 8, 9], "huang": [3, 5, 7, 8], "overal": [3, 5, 6, 7, 8, 9], "annot": [3, 5, 6, 7, 8], "mirror": [3, 5, 8], "inaccur": [3, 5, 8, 9], "consecut": [3, 8], "unrepres": 3, "hao": [3, 5], "accord": [3, 4, 5, 8, 9], "yin": [3, 5, 8], "resembl": 3, "declin": [3, 4, 5, 6], "volatil": [3, 5, 6], "ineffici": [3, 4, 5, 6], "smollm": 3, "rel": [3, 4, 5, 6, 7, 8], "term": [3, 4, 5, 6, 7, 8], "trade": [3, 4, 5, 7, 8, 9], "weigh": 3, "qwen": [3, 7, 9], "remark": [3, 4, 7, 8, 9], "rival": [3, 7], "ultim": [3, 4, 5, 7, 8], "threshold": [3, 4, 5, 7, 8], "chen": [3, 5, 6, 7, 8, 9], "overli": [3, 5, 8, 9], "simpli": [3, 4, 5, 6, 7, 9], "neglect": [3, 5, 8], "themselv": [3, 5], "complementari": 3, "throughput": [3, 4, 7], "screen": [3, 5, 8], "flag": [3, 5, 7, 8], "preliminari": [3, 5], "judgment": [3, 5, 6], "valid": [3, 4, 5, 7, 9], "automat": [3, 5, 7, 8], "advis": 3, "composit": [3, 5], "plai": [3, 5, 7, 8, 9], "led": [3, 5, 9], "apologet": 3, "hesit": 3, "benign": [3, 8], "apolog": 3, "inde": 3, "accordingli": [3, 5, 8], "perhap": [3, 4, 9], "creation": [3, 6, 7, 8], "invalu": 3, "hyperparamet": [3, 7, 8], "mention": [3, 5, 6, 8, 9], "optimist": 3, "memor": [3, 5], "generaliz": 3, "misc": [3, 4, 5, 6, 7, 8, 9], "tharsistpsouza2024tamingllm": [3, 4, 5, 6, 7, 8, 9], "author": [3, 4, 5, 6, 7, 8, 9], "p": [3, 4, 5, 6, 7, 8, 9], "journal": [3, 4, 5, 6, 7, 8, 9], "abc": [3, 8], "4a": 3, "amanda": [3, 5, 8], "jan": [3, 5, 8], "brauner": [3, 8], "adrian": 3, "colyer": 3, "benjamin": [3, 5, 8], "cullen": [3, 8], "david": [3, 5, 7, 8], "duvenaud": 3, "richard": [3, 5, 8], "ngo": [3, 8], "azalia": 3, "mirhoseini": 3, "catherin": [3, 5, 8], "olsson": [3, 8], "sam": [3, 5, 8], "ringer": 3, "liam": [3, 5, 8], "skirvin": 3, "jess": [3, 5, 8], "smith": [3, 5, 7], "dawn": [3, 5, 8], "song": [3, 4, 5, 8, 9], "william": [3, 4, 5, 6, 7, 8], "saunder": [3, 5], "steinhardt": [3, 5], "asset": [3, 5, 6, 8], "983c85a201a962f": 3, "pdf": [3, 6, 7, 8], "4b": 3, "24c8d0a3a7d0a1f1": 3, "bjn": 3, "22": [3, 5, 6, 8], "yuntao": [3, 5, 8], "andi": [3, 5, 8], "jone": [3, 5], "kamal": 3, "ndouss": 3, "anna": [3, 5, 8], "nova": [3, 7], "dassarma": 3, "drain": 3, "stanislav": 3, "fort": [3, 8], "ganguli": [3, 5, 8], "tom": [3, 5], "henighan": 3, "nichola": [3, 5], "joseph": [3, 5, 8], "saurav": [3, 8], "kadavath": 3, "jackson": [3, 5, 8], "kernion": [3, 5, 8], "conerli": 3, "sheer": [3, 9], "el": 3, "showk": 3, "nelson": 3, "elhag": 3, "zac": 3, "hatfield": 3, "dodd": 3, "danni": [3, 5, 8], "hernandez": [3, 5, 8], "tristan": 3, "hume": 3, "scott": [3, 5, 8], "johnston": 3, "shauna": 3, "kravec": 3, "lian": 3, "lovitt": 3, "neel": [3, 5], "nanda": 3, "dario": [3, 5], "amodei": [3, 5], "brown": [3, 5], "jack": [3, 5, 8], "clark": 3, "mccandlish": [3, 5], "chri": [3, 5, 8], "olah": 3, "ben": [3, 5, 7, 8], "mann": [3, 8], "jare": [3, 5, 8], "kaplan": [3, 5, 8], "arxiv": [3, 4, 5, 6, 7, 8, 9], "org": [3, 4, 5, 6, 7, 8, 9], "ab": [3, 4, 5, 6, 7, 8, 9], "2204": 3, "05862": 3, "bkk": 3, "sandipan": 3, "kundu": 3, "goldi": 3, "cameron": [3, 5, 8, 9], "mckinnon": 3, "carol": [3, 8], "christoph": [3, 5, 8], "dustin": 3, "eli": [3, 5, 7, 8], "tran": [3, 9], "johnson": 3, "ethan": [3, 5, 6, 8], "perez": [3, 6, 8], "jami": [3, 8], "kerr": 3, "mueller": 3, "jeffrei": 3, "ladish": 3, "joshua": [3, 5, 8], "landau": 3, "kamil": [3, 5], "lukosuit": 3, "michael": [3, 5, 6, 7, 8, 9], "sellitto": 3, "schiefer": 3, "noemi": 3, "mercado": 3, "robert": [3, 5, 7], "lasenbi": 3, "robin": 3, "larson": 3, "tamera": 3, "lanham": 3, "timothi": [3, 5, 7], "telleen": 3, "lawton": 3, "samuel": [3, 5, 8], "bowman": [3, 5], "2212": 3, "08073": 3, "blo23": 3, "announc": [3, 5], "cc": 3, "11": [3, 5, 6, 7, 8, 9], "ccl": [3, 8], "24": [3, 4, 5, 6, 7, 8, 9], "guim": 3, "hardi": 3, "shunian": 3, "zich": 3, "liu": [3, 5, 6, 7, 8, 9], "jiang": [3, 5, 8], "benyou": 3, "wang": [3, 4, 5, 6, 7, 8, 9], "judgement": [3, 5, 8], "2402": [3, 8], "10669": 3, "dphz23": 3, "tim": [3, 6, 8], "artidoro": 3, "pagnoni": 3, "ari": [3, 5, 8], "holtzman": [3, 5], "luke": [3, 5, 8], "zettlemoy": 3, "2305": [3, 5], "14314": 3, "ddz": 3, "qingxiu": 3, "xingx": 3, "zhang": [3, 5, 7, 8], "zhifang": 3, "sui": 3, "furu": [3, 4], "wei": [3, 4, 5, 6, 7, 8], "boost": 3, "2410": [3, 4, 8], "06961": 3, "fac24": [3, 5], "huggingfaceh4": [3, 7, 8], "fac4c": 3, "fac4d": [3, 7], "doc": [3, 4, 5, 6, 7, 8, 9], "en": [3, 5, 7, 8, 9], "fqh": 3, "duanyu": 3, "bowen": [3, 5, 7, 8], "qin": [3, 5, 7, 8], "zheng": [3, 5, 6, 7, 8], "wenqiang": 3, "lei": [3, 5, 7, 8], "analyz": [3, 4, 5, 6, 7, 8, 9], "perspect": [3, 6, 8], "2404": [3, 5, 8], "04626": 3, "h44a": 3, "binari": [3, 5, 7, 8], "h44b": 3, "hhj": 3, "shuang": 3, "wenfeng": 3, "han": [3, 5, 8], "tao": [3, 5, 8], "yipe": 3, "haonan": 3, "chunlin": 3, "zhong": [3, 8], "zhangjun": 3, "zhou": [3, 4, 5, 6, 7, 8], "tang": [3, 5, 7, 8], "2401": [3, 5], "01629": 3, "hlt24": 3, "jiwoo": 3, "noah": [3, 5, 8], "lee": [3, 5, 6, 7, 8, 9], "jame": [3, 5, 8], "thorn": 3, "orpo": 3, "monolith": 3, "2403": [3, 5], "07691": 3, "hdn": 3, "zhenyu": 3, "pengfan": 3, "du": [3, 5], "yilin": 3, "niu": [3, 9], "zhengxiao": 3, "aohan": 3, "zeng": [3, 8], "xiao": [3, 8], "minli": 3, "hongn": 3, "jie": [3, 5, 8, 9], "yuxiao": 3, "2412": [3, 5, 7, 8], "06000": 3, "hsw": 3, "21": [3, 5, 6, 7], "edward": [3, 5], "j": [3, 5, 7, 8, 9], "yelong": 3, "shen": [3, 5, 8], "phillip": 3, "walli": 3, "zeyuan": 3, "allen": [3, 5], "zhu": [3, 5, 7, 8], "yuanzhi": 3, "shean": 3, "lu": [3, 5, 7, 8], "weizhu": 3, "2106": 3, "09685": 3, "hgh": 3, "jiaxin": 3, "shixiang": [3, 5, 8], "shane": [3, 5, 8], "gu": [3, 5, 8], "le": [3, 5, 7], "yuexin": 3, "xuezhi": 3, "hongkun": 3, "yu": [3, 5, 7, 8], "jiawei": [3, 9], "2210": [3, 8], "11610": 3, "huy24": 3, "chip": 3, "reilli": 3, "media": [3, 4, 5, 8], "decemb": [3, 5, 6, 8], "9781098129095": 3, "www": [3, 5, 6, 7, 8], "oreilli": 3, "ksd": 3, "rylan": [3, 5, 8], "schaeffer": [3, 8], "apratim": 3, "dei": 3, "matthia": [3, 5], "gerstgrass": 3, "rafael": 3, "donoho": 3, "sanmi": [3, 8], "koyejo": [3, 8], "thrive": [3, 5, 9], "peril": 3, "16713": 3, "ksy": 3, "seungon": 3, "juyoung": 3, "suk": 3, "xiang": [3, 5, 7], "yue": 3, "vijai": 3, "viswanathan": 3, "seongyun": 3, "yizhong": 3, "kiril": 3, "gashteovski": 3, "carolin": [3, 8], "lawrenc": 3, "sean": [3, 5, 8], "welleck": 3, "graham": 3, "neubig": 3, "03679": 3, "lt24": 3, "herd": [3, 7], "2407": [3, 5, 7, 8], "21783": [3, 7], "lwx": 3, "lin": [3, 5, 6, 7, 8, 9], "rui": [3, 5, 7, 9], "ruixuan": 3, "junbo": 3, "zhao": [3, 5, 7, 8], "ding": 3, "gang": [3, 5], "haobo": 3, "driven": [3, 5, 7, 8], "survei": [3, 5, 8, 9], "2406": [3, 5, 6, 7, 8], "15126": 3, "met24": 3, "owj": 3, "jeff": [3, 5, 8], "xu": [3, 5, 7, 8], "diogo": [3, 8], "almeida": [3, 8], "carrol": [3, 8], "wainwright": [3, 8], "pamela": [3, 5, 8], "mishkin": [3, 5, 8], "chong": [3, 8], "sandhini": [3, 8], "agarw": [3, 5, 8], "katarina": [3, 8], "slama": [3, 8], "alex": [3, 5, 7, 8], "rai": [3, 5, 7, 8], "john": [3, 5, 6, 8], "hilton": [3, 5, 7, 8], "fraser": [3, 8], "kelton": 3, "miller": [3, 5], "maddi": [3, 8], "simen": [3, 8], "peter": [3, 5, 7, 8], "welind": [3, 5, 8], "paul": [3, 5, 8], "christiano": [3, 8], "leik": [3, 5, 8], "ryan": [3, 5, 8], "2203": 3, "02155": 3, "qwe24": 3, "rsm": 3, "archit": 3, "sharma": [3, 8], "eric": [3, 5, 7, 8], "mitchel": [3, 6, 7], "stefano": [3, 5], "ermon": [3, 5], "man": [3, 5, 8], "chelsea": [3, 8], "finn": 3, "secretli": 3, "18290": 3, "swd": 3, "17": [3, 5, 6, 7, 8], "filip": [3, 8], "wolski": 3, "prafulla": 3, "dhariw": 3, "alec": [3, 5, 8], "radford": [3, 5, 8], "oleg": [3, 8], "klimov": 3, "1707": 3, "06347": 3, "smollm224": 3, "distil": [3, 4], "smollm2360mi24": 3, "sou24": 3, "html": [3, 6, 9], "srverh24": 3, "m\u00e1rton": 3, "daniel": [3, 5, 8], "rueckert": 3, "r\u00fcdiger": 3, "von": [3, 5, 7], "eisenhart": 3, "roth": [3, 5], "florian": 3, "hinterwimm": 3, "2411": [3, 6], "09539": 3, "tm": [3, 7], "23": [3, 5, 6, 7, 8], "hugo": [3, 7], "loui": [3, 5, 7], "martin": [3, 5, 6, 7, 8], "kevin": [3, 5, 7, 8], "stone": [3, 7], "albert": [3, 7], "amjad": [3, 7], "almahairi": [3, 7], "yasmin": [3, 7], "babaei": [3, 7], "nikolai": [3, 7], "bashlykov": [3, 7], "soumya": [3, 7], "batra": [3, 7], "prajjwal": [3, 7], "bhargava": [3, 7], "shruti": [3, 7], "bhosal": [3, 7], "dan": [3, 5, 7, 8, 9], "bikel": [3, 7], "luka": [3, 7], "blecher": [3, 7], "cristian": [3, 7], "canton": [3, 7], "ferrer": [3, 7], "moya": [3, 7], "guillem": [3, 7], "cucurul": [3, 7], "esiobu": [3, 7], "jude": [3, 7], "fernand": [3, 7], "jeremi": [3, 5, 6, 7], "fu": [3, 7], "wenyin": [3, 7], "brian": [3, 7, 8], "fuller": [3, 7, 8], "cynthia": [3, 7], "gao": [3, 5, 7, 8], "vedanuj": [3, 7], "goswami": [3, 7, 8], "naman": [3, 6, 7], "goyal": [3, 6, 7], "anthoni": [3, 6, 7], "hartshorn": [3, 7], "saghar": [3, 7], "hosseini": [3, 7], "hakan": [3, 7, 8], "inan": [3, 7, 8], "marcin": [3, 7], "karda": [3, 7], "viktor": [3, 7], "kerkez": [3, 7], "madian": [3, 7, 8], "khabsa": [3, 7, 8], "isabel": [3, 7, 8], "kloumann": [3, 7], "artem": [3, 7], "korenev": [3, 7], "punit": [3, 7], "singh": [3, 5, 6, 7], "koura": [3, 7], "mari": [3, 5, 7, 8], "ann": [3, 7, 8], "lachaux": [3, 7], "thibaut": [3, 7], "lavril": [3, 7], "jenya": [3, 7], "diana": [3, 5, 7], "liskovich": [3, 7], "yinghai": [3, 7], "yune": [3, 7, 8], "mao": [3, 4, 7, 8], "xavier": [3, 7], "martinet": [3, 7], "todor": [3, 7, 8], "mihaylov": [3, 7], "pushkar": [3, 7], "mishra": [3, 5, 7], "igor": [3, 5, 7, 8], "molybog": [3, 7], "yixin": [3, 5, 7], "nie": [3, 5, 6, 7], "andrew": [3, 5, 7, 8], "poulton": [3, 7], "reizenstein": [3, 7], "rashi": [3, 7, 8], "rungta": [3, 7, 8], "kalyan": [3, 7], "saladi": [3, 7], "alan": [3, 7, 8], "schelten": [3, 7], "ruan": [3, 7], "silva": [3, 7], "ranjan": [3, 7], "subramanian": [3, 7], "xiaoq": [3, 7], "ellen": [3, 7], "tan": [3, 5, 6, 7], "binh": [3, 7], "ross": [3, 4, 7, 8], "taylor": [3, 7], "adina": [3, 7, 8], "jian": [3, 5, 6, 7], "kuan": [3, 7], "puxin": [3, 7], "yan": [3, 4, 5, 7], "iliyan": [3, 7], "zarov": [3, 7], "yuchen": [3, 5, 7, 8], "angela": [3, 5, 7, 8], "fan": [3, 5, 7], "melani": [3, 7], "kambadur": [3, 7], "sharan": [3, 7], "narang": [3, 7], "aurelien": [3, 7], "rodriguez": [3, 7], "stojnic": [3, 7], "sergei": [3, 7], "edunov": [3, 7], "thoma": [3, 5, 7, 8], "scialom": [3, 7], "2307": [3, 7, 9], "09288": [3, 7], "vaa": [3, 8], "berti": [3, 8], "adarsh": [3, 8], "agraw": [3, 8], "ahm": [3, 8], "victor": [3, 8], "akinwand": [3, 8], "namir": [3, 8], "nuaimi": [3, 8], "najla": [3, 8], "alfaraj": [3, 8], "alhajjar": [3, 8], "aroyo": [3, 8], "trupti": [3, 8], "bavalatti": [3, 8], "max": [3, 5, 6, 8], "bartolo": [3, 8], "borhan": [3, 8], "blili": [3, 8], "hamelin": [3, 8], "kurt": [3, 8], "bollack": [3, 8], "rishi": [3, 5, 7, 8], "bomassani": [3, 8], "marisa": [3, 8], "ferrara": [3, 8], "boston": [3, 8], "sim\u00e9on": [3, 8], "campo": [3, 8], "kal": [3, 8], "chakra": [3, 8], "canyu": [3, 8], "codi": [3, 8], "coleman": [3, 8], "zachari": [3, 5, 8], "delpierr": [3, 8], "coudert": [3, 8], "leon": [3, 8], "derczynski": [3, 8], "debojyoti": [3, 8], "dutta": [3, 8], "ian": [3, 5, 8], "eisenberg": [3, 8], "ezick": [3, 8], "heather": [3, 8], "frase": [3, 8], "ram": [3, 7, 8], "gandikota": [3, 8], "agasthya": [3, 8], "gangavarapu": [3, 8], "ananya": [3, 5, 8], "geali": [3, 8], "rajat": [3, 8], "ghosh": [3, 5, 8], "goel": [3, 5, 8], "usman": [3, 8], "gohar": [3, 8], "sujata": [3, 8], "hale": [3, 8], "wiebk": [3, 8], "hutiri": [3, 8], "marvin": [3, 8], "imperi": [3, 8], "surgan": [3, 8], "jandial": [3, 8], "nick": [3, 5, 8], "judd": [3, 8], "felix": [3, 5, 8], "juefei": [3, 8], "fouts": [3, 8], "khomh": [3, 8], "bhavya": [3, 8], "kailkhura": [3, 8], "hannah": [3, 5, 8], "rose": [3, 8], "kirk": [3, 8], "klyman": [3, 8], "knotz": [3, 8], "kuchnik": [3, 8], "shachi": [3, 8], "kumar": [3, 5, 8], "srijan": [3, 8], "lengerich": [3, 8], "bo": [3, 5, 7, 8], "zeyi": [3, 8], "liao": [3, 5, 8], "eileen": [3, 8], "sarah": [3, 5, 8], "luger": [3, 8], "yifan": [3, 5, 8], "priyanka": [3, 8], "mammen": [3, 8], "kelvin": [3, 6, 8], "manyeki": [3, 8], "mcgregor": [3, 8], "virendra": [3, 8], "mehta": [3, 5, 8], "shafe": [3, 8], "moham": [3, 8], "moss": [3, 8], "lama": [3, 8], "nachman": [3, 8], "dinesh": [3, 8], "jinenh": [3, 8], "naganna": [3, 8], "amin": [3, 8], "nikanjam": [3, 8], "besmira": [3, 8], "nushi": [3, 8], "lui": [3, 5, 8], "oala": [3, 8], "iftach": [3, 8], "orr": [3, 5, 8], "alicia": [3, 5, 8], "parrish": [3, 5, 8], "cigdem": [3, 8], "patlak": [3, 8], "pietri": [3, 8], "forough": [3, 8], "poursabzi": [3, 8], "sangdeh": [3, 8], "eleonora": [3, 8], "presani": [3, 8], "fabrizio": [3, 8], "puletti": [3, 8], "r\u00f6ttger": [3, 8], "sahai": [3, 8], "santo": [3, 8], "nino": [3, 8], "scherrer": [3, 8], "alic": [3, 5, 8, 9], "schoenauer": [3, 8], "sebag": [3, 8], "patrick": [3, 6, 8], "schramowski": [3, 8], "abolfazl": [3, 8], "shahbazi": [3, 8], "vin": [3, 8], "xudong": [3, 5, 6, 8], "vamsi": [3, 8], "sistla": [3, 8], "leonard": [3, 8], "testuggin": [3, 8], "vithursan": [3, 8], "thangarasa": [3, 8], "elizabeth": [3, 5, 8], "watkin": [3, 8], "rebecca": [3, 5, 8], "weiss": [3, 8], "welti": [3, 8], "tyler": [3, 5, 8], "wilber": [3, 8], "jean": [3, 8], "poonam": [3, 8], "yadav": [3, 8], "xianjun": [3, 8], "yang": [3, 5, 6, 7, 8, 9], "yi": [3, 5, 6, 8, 9], "wenhui": [3, 8], "fedor": [3, 8], "zhdanov": [3, 8], "jiacheng": [3, 5, 8], "perci": [3, 5, 8], "liang": [3, 5, 8, 9], "mattson": [3, 8], "joaquin": [3, 8], "vanschoren": [3, 8], "v0": [3, 8], "12241": [3, 8], "wyg": 3, "tianhao": [3, 5, 7, 8], "weizh": 3, "yuan": [3, 5, 8], "olga": 3, "golovneva": 3, "jing": [3, 8], "yuandong": 3, "tian": 3, "jiantao": 3, "jiao": 3, "jason": [3, 5, 8], "weston": 3, "sainbayar": 3, "sukhbaatar": 3, "19594": 3, "ywx": 3, "yueqin": 3, "zhendong": 3, "yujia": [3, 6], "xie": [3, 5, 8], "mingyuan": 3, "paradigm": [3, 5], "semanticscholar": 3, "corpusid": 3, "270199610": 3, "suppos": [4, 5, 9], "econom": [4, 5, 6], "fuel": 4, "equival": [4, 5, 7], "consumpt": [4, 5], "contrari": 4, "truth": [4, 5, 7, 8, 9], "stanlei": 4, "jevon": 4, "a16z": 4, "andreessen": 4, "horowitz": 4, "10x": 4, "outpac": 4, "moor": 4, "pc": 4, "edholm": 4, "bandwidth": 4, "era": 4, "llmflation": 4, "mmlu": [4, 7, 8], "60": [4, 5, 6, 7, 8], "06": [4, 5, 6, 9], "price": [4, 5, 6, 7], "fallen": 4, "62": [4, 5, 7], "introduct": 4, "march": [4, 5, 9], "stem": [4, 5, 9], "compound": 4, "bit": [4, 6, 7], "tune": [4, 5, 8], "dpo": [4, 7], "competit": [4, 5, 6, 7, 8], "plummet": 4, "rapidli": [4, 7, 8, 9], "preciou": 4, "wouldn": [4, 5], "sens": [4, 8], "wait": [4, 5, 8], "wave": 4, "economist": 4, "1865": 4, "studi": [4, 9], "coal": 4, "industri": [4, 5, 6, 7, 8, 9], "made": [4, 5, 6, 7, 9], "counterintuit": 4, "discoveri": 4, "steam": 4, "spend": [4, 5, 6], "repeat": [4, 6], "didn": [4, 9], "smartphon": [4, 5, 6, 7], "server": [4, 5, 7, 9], "network": [4, 5, 6, 7, 9], "transmiss": 4, "got": 4, "cheaper": [4, 5], "shift": [4, 5, 6], "hd": 4, "stream": [4, 5, 6, 7, 9], "storag": [4, 5, 7, 8], "gigabyt": 4, "entir": [4, 5, 6, 7, 9], "massiv": [4, 5, 8], "broadli": [4, 6, 7, 9], "audio": [4, 5, 6], "transcript": [4, 6], "multimod": [4, 7, 8], "imag": [4, 5, 6, 7, 8], "exponenti": [4, 5], "growth": [4, 5, 6], "magnifi": 4, "everyth": [4, 9], "billion": [4, 5, 6, 7, 9], "dollar": [4, 5, 7], "annual": [4, 5, 6, 8], "millisecond": [4, 5], "latenc": [4, 5, 7, 8], "30": [4, 5, 6, 7, 8], "mobil": [4, 5, 7, 9], "b": [4, 5, 7, 8, 9], "tradeoff": [4, 7, 8, 9], "pro": [4, 5, 6, 7, 8], "trigger": [4, 6, 8], "premium": [4, 5], "innov": [4, 5, 6, 7, 8], "capac": [4, 5, 6, 7], "link": [4, 5], "dual": 4, "character": [4, 5, 8], "ahead": [4, 7, 8], "decai": [4, 7], "discuss": [4, 5, 7, 8], "area": [4, 5, 8, 9], "flash": [4, 6, 7], "cach": [4, 5, 6, 7], "prompt": [4, 5, 6, 8], "compress": [4, 5, 7], "provis": [4, 5], "extent": [4, 5, 8], "problema": 4, "accomplish": [4, 8, 9], "accompani": [4, 5, 8], "transact": [4, 5, 8], "roi": 4, "alloc": [4, 5, 6, 7, 8], "budget": [4, 7], "viabil": [4, 7], "prioriti": [4, 5, 7], "overlook": [4, 6], "thorough": [4, 7, 8], "identif": [4, 5], "specifi": [4, 5, 6, 7, 8, 9], "longev": 4, "accommod": 4, "evalu": [4, 6, 7, 9], "multi": [4, 5, 6, 7, 8, 9], "baselin": [4, 5, 7, 8], "met": [4, 5, 8], "equal": [4, 5, 6, 8], "concurr": [4, 7], "peak": 4, "spike": 4, "versu": [4, 5, 7, 8], "volum": [4, 5, 7, 8], "season": [4, 5], "variat": [4, 5, 7, 8], "uptim": 4, "mainten": [4, 5, 7, 8], "disrupt": [4, 5, 6], "backup": 4, "failov": 4, "clearli": [4, 5, 8, 9], "redund": [4, 5], "recoveri": [4, 5], "unexpect": [4, 5, 8, 9], "event": [4, 5], "seamless": [4, 5, 8], "broader": [4, 5, 6, 7, 8], "vector": [4, 7, 8], "augment": [4, 5, 7], "rag": [4, 7], "retent": [4, 5, 6], "polici": [4, 5, 6, 7], "essenti": [4, 5, 6, 7, 8, 9], "opportun": [4, 5, 6], "post": [4, 5, 7, 8], "32": [4, 5, 6, 7], "fp32": 4, "fp16": [4, 7], "proport": [4, 5, 7], "byte": 4, "120": [4, 5, 8], "gb": 4, "whole": [4, 5], "done": [4, 5, 7, 8, 9], "smollm2": [4, 5, 7, 9], "135m": [4, 7], "load_gguf": 4, "bartowski": 4, "gguf": [4, 7], "gguf_file_q2_k": 4, "q2_k": [4, 7], "gguf_file_f16": 4, "f16": 4, "model_q2_k": 4, "gguf_fil": 4, "model_f16": 4, "mlp": 4, "layer": [4, 5, 7, 9], "proxi": [4, 5, 6, 8], "mlp_weights_q2_k": 4, "gate_proj": 4, "mlp_weights_f16": 4, "tensor": [4, 9], "0145": 4, "1826": 4, "1377": 4, "1719": 4, "1387": 4, "0298": 4, "1631": 4, "0781": 4, "2051": [4, 5], "2070": 4, "0334": 4, "2891": 4, "1768": 4, "0488": 4, "2393": 4, "0396": 4, "1348": 4, "1533": 4, "0771": 4, "0845": 4, "0232": 4, "0178": 4, "1040": 4, "1582": 4, "1167": 4, "0474": 4, "0359": 4, "2500": 4, "0432": 4, "0972": 4, "0933": 4, "2188": 4, "0776": 4, "0674": 4, "requires_grad": 4, "0028": 4, "1852": 4, "1396": 4, "1506": 4, "1635": 4, "0043": 4, "0680": 4, "2257": 4, "1890": 4, "0464": 4, "2960": 4, "1840": 4, "0451": 4, "2395": 4, "0413": 4, "1446": 4, "0621": 4, "0478": 4, "0038": 4, "0830": 4, "1473": 4, "0926": 4, "0547": 4, "0824": 4, "0429": 4, "2737": 4, "0355": 4, "0782": 4, "2043": [4, 5], "0740": 4, "arriv": [4, 5], "pearson": 4, "numpi": [4, 5], "np": [4, 5], "arrai": [4, 8], "detach": 4, "graph": [4, 5], "weights_f16": 4, "weights_q2_k": 4, "flat_f16": 4, "flatten": 4, "flat_q2_k": 4, "corrcoef": 4, "4f": [4, 9], "9970": 4, "exemplifi": [4, 6, 7, 8], "70b": [4, 5, 7], "unsloth": 4, "141": 4, "q8_0": [4, 7], "75": [4, 8], "47": [4, 5, 7, 8], "cumul": [4, 5, 6], "26": [4, 5, 7], "19": [4, 5, 6, 7, 8], "space": [4, 5, 7, 8], "counterpart": 4, "spectrum": [4, 5, 6], "variant": [4, 5, 7, 8], "laptop": [4, 5], "desktop": [4, 5, 7], "enterpris": [4, 5, 7, 8, 9], "ceil": 4, "notabl": [4, 5, 8, 9], "bitnet": 4, "cpp": [4, 9], "arm": 4, "x86": 4, "speedup": [4, 7], "37x": 4, "07x": 4, "17x": 4, "beyond": [4, 5, 8], "raw": [4, 5, 7, 8, 9], "speed": [4, 5, 7, 8], "energi": [4, 5, 6], "55": [4, 5, 6, 7], "70": [4, 5, 7], "71": [4, 5], "82": [4, 8], "impress": [4, 7, 9], "100b": 4, "b1": 4, "58": [4, 6, 7], "pace": [4, 5, 6, 8], "kernel": 4, "characterist": [4, 5, 7, 8, 9], "excit": [4, 7], "frontier": [4, 8], "compel": [4, 5, 7, 9], "acceler": [4, 5, 7, 8], "faster": [4, 7], "arithmet": [4, 5], "benefici": [4, 5, 7], "sustain": [4, 5, 6, 7, 8], "Be": [4, 5, 7, 8], "fine": [4, 5, 8], "pure": [4, 5, 7, 9], "unlock": [4, 9], "track": [4, 5, 6, 8], "chargeback": 4, "regularli": [4, 5], "wz": 4, "jinheng": 4, "hansong": 4, "ting": [4, 8], "shaoguang": 4, "shume": [4, 8], "ma": [4, 5, 8], "hongyu": [4, 5], "xia": [4, 5, 7], "infra": 4, "fast": [4, 5, 7, 8, 9], "lossless": 4, "16144": 4, "andreessenhorowitz24": 4, "huggingface4w": [4, 7], "2024w": [4, 7], "unsloth24": 4, "jonathan": [4, 5, 8], "ceo": [4, 5], "groq": [4, 7], "streamlin": [4, 5, 6, 7, 9], "notat": 4, "width": [4, 7], "_k": 4, "_0": 4, "matter": [5, 6], "beauti": 5, "smart": [5, 8], "agre": 5, "wrong": 5, "feynman": 5, "advent": 5, "pivot": [5, 7], "verif": [5, 6, 7, 9], "norm": 5, "realm": 5, "convent": [5, 8], "evolut": [5, 7], "conceiv": 5, "entrench": 5, "seem": 5, "daunt": [5, 6], "ignor": 5, "outdat": [5, 6, 8, 9], "inevit": 5, "setback": 5, "imper": 5, "embrac": 5, "proactiv": [5, 8], "mindset": 5, "front": [5, 7], "incorpor": [5, 6, 7, 8, 9], "produc": [5, 6, 7, 8, 9], "novel": [5, 7], "ident": [5, 6], "isn": [5, 8], "bug": 5, "random": [5, 8, 9], "testabl": 5, "guarante": [5, 6, 7, 8, 9], "exceedingli": 5, "primari": [5, 8], "nucleu": 5, "2020": 5, "summari": [5, 6, 7, 8, 9], "alter": 5, "rigid": 5, "wildli": 5, "incoher": 5, "inadequ": [5, 8], "temp": 5, "df_result": 5, "ntemperatur": 5, "40": [5, 6, 7], "temp_respons": 5, "iterrow": [5, 8], "10000": [5, 6, 9], "appl": [5, 6, 9], "txt": [5, 6, 7, 9], "sec_fil": [5, 9], "nsecur": 5, "AND": [5, 9], "exchang": [5, 6, 8, 9], "commiss": [5, 6, 8, 9], "nwashington": 5, "20549": 5, "nform": 5, "pursuant": 5, "TO": [5, 8], "13": [5, 6, 7, 8], "OR": 5, "OF": [5, 8], "THE": [5, 8], "1934": 5, "nfor": 5, "fiscal": [5, 6], "septemb": [5, 6], "28": [5, 6, 7, 8], "nor": 5, "period": [5, 6, 8], "ncommiss": 5, "001": [5, 7], "36743": 5, "ng66145g66i43": 5, "jpg": 5, "nappl": 5, "exact": [5, 7, 8], "registr": 5, "charter": 5, "ncalifornia": 5, "t94": 5, "2404110": 5, "jurisdict": 5, "nof": 5, "employ": 5, "park": 5, "ncupertino": 5, "california": [5, 8, 9], "n95014": 5, "princip": 5, "offic": [5, 6, 8], "408": 5, "996": 5, "1010": 5, "telephon": 5, "regist": 5, "ntitl": 5, "ttrade": 5, "symbol": 5, "tname": 5, "ncommon": 5, "stock": [5, 9], "00001": 5, "naapl": 5, "tthe": 5, "nasdaq": [5, 6, 9], "llc": [5, 9], "n0": 5, "000": [5, 7, 9], "note": [5, 6, 7, 9], "2025": [5, 6], "875": 5, "625": 5, "2026": 5, "2027": 5, "375": 5, "2029": 5, "050": 5, "2031": [5, 8], "600": 5, "2042": 5, "nindic": 5, "issuer": 5, "405": 5, "nye": 5, "preced": [5, 9], "shorter": 5, "past": [5, 6, 8], "90": [5, 6, 7, 8], "submit": [5, 7, 8], "electron": 5, "232": 5, "filer": 5, "12b": [5, 8], "nlarg": 5, "tacceler": 5, "nnon": 5, "tsmaller": 5, "nemerg": 5, "nif": 5, "elect": [5, 8], "revis": [5, 8], "attest": 5, "404": 5, "sarban": 5, "oxlei": 5, "7262": 5, "firm": [5, 8], "prepar": [5, 6, 7, 8], "correct": [5, 6, 8], "restat": 5, "incent": 5, "compens": 5, "240": 5, "10d": 5, "shell": 5, "aggreg": [5, 8, 9], "vote": 5, "held": [5, 9], "affili": [5, 9], "29": [5, 7, 8, 9], "last": [5, 6, 8, 9], "quarter": 5, "628": [5, 9], "553": [5, 9], "sole": [5, 8], "disclosur": [5, 6, 7, 8], "director": [5, 7, 8], "date": [5, 6], "exclud": 5, "n15": 5, "115": [5, 9], "823": [5, 9], "outstand": [5, 9], "octob": [5, 9], "18": [5, 6, 7, 8, 9], "ndocument": 5, "BY": 5, "nportion": 5, "meet": [5, 6, 8, 9], "sharehold": [5, 6], "iii": 5, "ntabl": 5, "npage": 5, "npart": 5, "nitem": 5, "nbusi": 5, "1a": 5, "nrisk": 5, "1b": [5, 7, 8], "nunresolv": 5, "staff": 5, "comment": 5, "n17": 5, "1c": 5, "ncybersecur": 5, "nproperti": 5, "n18": 5, "nlegal": 5, "proceed": [5, 6, 8], "nmine": 5, "ii": [5, 7, 9], "nmarket": 5, "stockhold": 5, "purchas": [5, 6, 8], "n19": 5, "reserv": [5, 6], "n20": 5, "nmanag": 5, "n21": 5, "7a": 5, "nquantit": 5, "n27": 5, "nfinanci": 5, "supplementari": 5, "n28": 5, "nchang": 5, "disagr": 5, "n51": 5, "9a": 5, "ncontrol": 5, "procedur": [5, 6, 8], "9b": 5, "nother": 5, "n52": 5, "9c": 5, "ndisclosur": 5, "foreign": [5, 6], "ndirector": 5, "corpor": [5, 6, 8], "nexecut": 5, "ownership": [5, 7], "certain": [5, 6, 8, 9], "owner": 5, "ncertain": 5, "nprincip": 5, "fee": [5, 6], "iv": 5, "nexhibit": 5, "n53": 5, "n56": 5, "nthi": 5, "litig": [5, 6, 7], "reform": 5, "1995": 5, "uncertainti": [5, 6, 7, 8], "macroeconom": [5, 6], "anticip": [5, 6, 8], "intend": [5, 7, 8], "caus": [5, 8, 9], "oblig": [5, 6], "nunless": 5, "herein": 5, "calendar": 5, "wholli": 5, "subsidiari": 5, "unless": [5, 7], "ncompani": 5, "manufactur": 5, "tablet": [5, 6, 7], "wearabl": 5, "accessori": 5, "sell": [5, 8], "varieti": [5, 7], "52": [5, 8], "53": [5, 6, 8], "week": 5, "saturdai": 5, "nproduct": 5, "niphon": 5, "io": [5, 6, 8, 9], "iphon": [5, 6], "se": [5, 8], "nmac": 5, "maco": [5, 7], "mac": [5, 7], "macbook": 5, "air": 5, "imac": 5, "studio": 5, "nipad": 5, "multipurpos": 5, "ipado": 5, "ipad": 5, "nwearabl": 5, "home": [5, 6, 9], "smartwatch": 5, "wireless": 5, "headphon": 5, "spatial": 5, "watcho": 5, "watch": 5, "ultra": 5, "airpod": 5, "beat": [5, 7], "visiono": 5, "nhome": 5, "tv": 5, "tvo": 5, "homepod": 5, "fidel": [5, 9], "naccessori": 5, "brand": 5, "third": [5, 6, 7, 8], "parti": [5, 6, 7, 8], "nservic": 5, "nadvertis": 5, "advertis": 5, "licens": [5, 6], "napplecar": 5, "portfolio": [5, 6], "applecar": 5, "repair": 5, "coverag": [5, 6, 8], "accident": 5, "damag": [5, 8], "theft": [5, 8], "ncloud": 5, "ndigit": 5, "app": [5, 6, 7], "discov": [5, 7, 8], "download": [5, 7], "music": 5, "podcast": 5, "subscript": [5, 7], "arcad": 5, "sm": 5, "listen": [5, 7], "radio": 5, "station": 5, "magazin": 5, "exclus": 5, "sport": 5, "npayment": 5, "payment": 5, "credit": [5, 6], "pai": [5, 7], "cashless": 5, "nsegment": 5, "primarili": [5, 6, 8], "geograph": [5, 6, 8], "basi": [5, 7], "segment": [5, 6, 8, 9], "america": [5, 6], "europ": 5, "china": [5, 6, 7, 8], "japan": 5, "rest": [5, 7], "asia": 5, "pacif": 5, "north": [5, 8], "south": 5, "european": [5, 8], "india": 5, "middl": [5, 7, 8], "east": 5, "africa": 5, "mainland": 5, "kong": 5, "taiwan": 5, "australia": 5, "asian": [5, 6], "although": [5, 7], "partner": [5, 6, 7, 8], "mid": [5, 6], "resel": [5, 6], "retail": 5, "sale": [5, 6], "indirect": 5, "channel": [5, 8], "cellular": 5, "carrier": 5, "net": [5, 6, 9], "38": [5, 6, 7, 8], "ncompetit": 5, "downward": 5, "pressur": [5, 8], "gross": [5, 8], "cycl": [5, 8], "competitor": [5, 6, 7, 8], "compet": [5, 6, 7], "imit": 5, "infring": [5, 7], "intellectu": [5, 7, 8], "marketplac": [5, 8], "nearli": [5, 7], "reput": [5, 8], "expand": [5, 6, 7, 8], "profit": [5, 6, 8, 9], "illegitim": [5, 8], "collabor": [5, 7, 8], "nsuppli": 5, "nalthough": 5, "particip": 5, "shortag": 5, "commod": [5, 6, 7], "fluctuat": [5, 6], "commonli": 5, "until": [5, 8, 9], "supplier": 5, "matur": 5, "concentr": [5, 6], "enter": [5, 9], "agreement": [5, 6], "suppli": [5, 6, 9], "renew": [5, 6], "nresearch": 5, "nbecaus": 5, "upon": [5, 6, 8], "flow": [5, 6, 9], "acquisit": [5, 6, 8], "nintellectu": 5, "broad": [5, 6, 7, 9], "patent": 5, "copyright": [5, 7], "trademark": 5, "secret": 5, "differenti": 5, "skill": [5, 8], "personnel": 5, "pursu": [5, 8], "thousand": [5, 7], "durat": 5, "adequ": [5, 8], "nin": 5, "holidai": [5, 8], "fill": 5, "inventori": 5, "older": [5, 7], "newer": 5, "distributor": 5, "nhuman": 5, "strive": 5, "retain": [5, 6, 7, 8], "talent": [5, 6], "member": [5, 8], "164": 5, "ncompens": 5, "equit": 5, "succe": 5, "health": [5, 6, 8], "awai": [5, 6, 8], "ngrowth": 5, "career": 5, "leadership": [5, 8], "nworkplac": 5, "workplac": 5, "ninclus": 5, "workforc": 5, "nengag": 5, "among": [5, 7, 8, 9], "everyon": [5, 7], "gaug": 5, "sentiment": [5, 6, 7, 9], "nhealth": 5, "everywher": 5, "crisi": 5, "visitor": 5, "navail": 5, "quarterli": 5, "q": [5, 6, 7, 8], "amend": 5, "sec": [5, 6, 9], "Such": [5, 8], "charg": 5, "investor": [5, 6, 9], "aspx": 5, "websit": [5, 7, 8], "environment": [5, 8], "referenc": [5, 6], "inact": 5, "textual": 5, "unknown": [5, 6, 8], "advers": 5, "conjunct": 5, "consolid": [5, 6], "nmacroeconom": 5, "facil": 5, "assembli": 5, "site": [5, 9], "nadvers": 5, "slow": 5, "recess": 5, "unemploy": [5, 6], "inflat": [5, 6], "tighter": 5, "currenc": [5, 6], "monetari": 5, "contract": [5, 7], "logist": 5, "instabl": [5, 8], "inabl": [5, 6], "financ": [5, 7, 8], "insolv": 5, "counterparti": 5, "debt": 5, "liquid": [5, 6], "fair": [5, 8], "instrument": 5, "polit": [5, 8], "disput": 5, "geopolit": 5, "tension": [5, 8], "terror": 5, "accid": 5, "interrupt": 5, "npolit": 5, "outsourc": [5, 6], "korea": 5, "vietnam": 5, "restrict": [5, 7, 8, 9], "tariff": 5, "export": [5, 6], "portion": [5, 7], "revenu": [5, 6, 9], "restructur": 5, "ceas": 5, "escal": [5, 8], "nmani": 5, "prone": [5, 8], "earthquak": 5, "climat": 5, "weather": 5, "plant": 5, "terrorist": [5, 8], "attack": [5, 8], "hostil": 5, "ransomwar": 5, "cybersecur": [5, 6, 8], "labor": 5, "nsuch": 5, "imposs": [5, 7], "slowdown": 5, "outag": 5, "neg": [5, 6, 8, 9], "pandem": 5, "covid": 5, "economi": 5, "imposit": 5, "stringent": [5, 7, 8], "travel": 5, "freight": 5, "movement": 5, "ramp": 5, "nfollow": 5, "expenditur": 5, "resum": 5, "exacerb": 5, "insur": 5, "nglobal": 5, "unabl": 5, "assur": [5, 8], "minor": [5, 6, 8], "naddition": 5, "intensifi": 5, "seamlessli": 5, "nto": 5, "stimul": 5, "ndue": 5, "upgrad": 5, "quantiti": 5, "defect": 5, "defici": 5, "supersed": 5, "nsubstanti": 5, "transport": 5, "reimburs": 5, "warranti": 5, "unanticip": 5, "liabil": 5, "finish": [5, 8], "destin": 5, "prepay": 5, "termin": [5, 7], "recover": 5, "exposur": [5, 8], "nfutur": 5, "semiconductor": 5, "suffer": [5, 8], "constrain": [5, 7, 9], "shipment": 5, "unexpectedli": 5, "interfer": 5, "unsaf": [5, 8], "expos": [5, 6, 8], "widespread": [5, 8], "vulner": [5, 6, 8], "compromis": [5, 7, 8], "claim": [5, 6, 7, 8], "intang": 5, "lost": [5, 6, 8], "cancel": 5, "obsolet": 5, "exce": [5, 8], "realiz": 5, "accru": 5, "excess": 5, "impair": 5, "whenev": 5, "circumst": 5, "amount": [5, 6, 8, 9], "carri": [5, 7, 9], "incur": 5, "unpredict": [5, 8], "obsolesc": 5, "forecast": [5, 6, 8], "incorrectli": [5, 8, 9], "extens": [5, 7, 9], "issuanc": 5, "unknowingli": [5, 8], "notifi": 5, "preclud": 5, "bui": 5, "percept": 5, "android": [5, 6], "playstat": 5, "nintendo": 5, "xbox": 5, "inclin": 5, "devot": 5, "dissatisfi": 5, "vast": [5, 8], "storefront": 5, "safari": 5, "union": [5, 8], "eu": [5, 6, 8], "dma": [5, 6], "narrow": [5, 7, 8], "scope": [5, 6, 7, 8], "elimin": [5, 7], "nfailur": 5, "appeal": [5, 6], "subscrib": [5, 6], "nsome": 5, "manner": [5, 6, 8], "nurtur": 5, "nmuch": 5, "chief": [5, 6], "silicon": 5, "vallei": 5, "constantli": 5, "driver": [5, 7], "recruit": 5, "subsidi": 5, "staf": 5, "contractor": 5, "placement": 5, "increment": 5, "weaken": 5, "telecommun": 5, "war": 5, "virus": 5, "ins": 5, "incid": [5, 8], "ineffect": 5, "thing": [5, 9], "interf": 5, "imped": 5, "ship": 5, "nloss": 5, "unauthor": [5, 8], "confidenti": [5, 7], "encrypt": 5, "But": [5, 8, 9], "behalf": 5, "normal": [5, 6, 8, 9], "investig": [5, 8], "penalti": [5, 7], "frequenc": [5, 7, 8], "actor": [5, 8], "circumv": [5, 8], "obfusc": 5, "forens": 5, "hinder": [5, 9], "recov": 5, "perpetr": 5, "profil": [5, 7], "authent": 5, "hack": [5, 8], "malfeas": 5, "faulti": 5, "password": 5, "irregular": 5, "fraudul": 5, "induc": 5, "disclos": [5, 6, 9], "usernam": 5, "turn": [5, 6, 8, 9], "multifactor": 5, "unusu": 5, "freez": 5, "suspici": 5, "nwhile": 5, "ninvest": 5, "ongo": [5, 6, 7], "contempl": 5, "endeavor": 5, "distract": 5, "tangibl": 5, "approv": 5, "oner": 5, "ventur": 5, "riski": 5, "leas": 5, "unfavor": [5, 6], "arisen": 5, "ordinari": 5, "cours": [5, 7, 8], "resolv": [5, 7, 8], "sometim": 5, "indemnif": 5, "indemnifi": 5, "alleg": 5, "magnitud": 5, "assert": [5, 6], "royalti": 5, "vigor": 5, "defend": 5, "court": [5, 7], "internation": 5, "plaintiff": 5, "injunct": 5, "relief": 5, "nregardless": 5, "merit": 5, "recognit": [5, 7, 8], "settl": 5, "uncertain": [5, 6], "disgorg": 5, "remedi": [5, 8], "worldwid": 5, "antitrust": [5, 6], "bill": [5, 6], "commerc": 5, "televis": 5, "film": 5, "anticorrupt": 5, "cash": [5, 6], "repatri": 5, "launder": 5, "tax": [5, 6], "wast": 5, "recycl": 5, "ncomplianc": 5, "impos": [5, 7, 8, 9], "agent": [5, 7, 8], "nregulatori": 5, "ban": [5, 8], "nexpect": 5, "increasingli": [5, 7, 8, 9], "greenhous": 5, "ga": 5, "emiss": 5, "civil": 5, "disagre": 5, "perceiv": 5, "feder": 5, "nfrom": 5, "noncompli": 5, "individu": [5, 6, 7, 8], "lawsuit": [5, 7], "monopol": 5, "nfurther": 5, "earn": 5, "search": [5, 6, 7, 8], "nthere": 5, "transfer": 5, "pass": [5, 6, 7, 8, 9], "pend": 5, "inquiri": [5, 8], "government": 5, "entiti": [5, 7, 8, 9], "biometr": 5, "notif": 5, "permit": [5, 7, 9], "healthcar": [5, 6, 7], "liabl": 5, "investigatori": 5, "cardhold": 5, "acquir": 5, "denomin": 5, "offset": 5, "strengthen": [5, 8], "nconvers": 5, "thu": 5, "hedg": 5, "deterior": 5, "sovereign": 5, "heighten": [5, 8], "worsen": 5, "A": [5, 6, 7, 8, 9], "collater": 5, "bank": 5, "unsecur": 5, "subassembli": 5, "assembl": 5, "legisl": 5, "ireland": [5, 8], "singapor": 5, "organis": 5, "statutori": 5, "valuat": [5, 6], "defer": 5, "bodi": [5, 8], "adequaci": 5, "ow": 5, "ngener": 5, "repurchas": 5, "dividend": 5, "consumm": 5, "declar": [5, 6], "board": [5, 6, 8], "unresolv": 5, "nnone": 5, "threat": [5, 6, 8], "postur": 5, "25": [5, 6, 7, 8], "2016": 5, "coordin": [5, 8], "committe": [5, 8], "oversight": [5, 8], "counsel": 5, "chair": 5, "headquart": 5, "cupertino": [5, 9], "center": [5, 8, 9], "formal": [5, 8, 9], "conclud": [5, 7], "uninstal": 5, "web": [5, 6, 7, 8], "browser": 5, "june": 5, "contractu": 5, "desist": 5, "stai": [5, 7], "grant": 5, "ndepart": 5, "justic": 5, "depart": [5, 8], "doj": 5, "district": 5, "attornei": 5, "jersei": 5, "redress": [5, 8], "anticompetit": 5, "nonmonetari": 5, "defens": [5, 8], "nepic": 5, "epic": 5, "northern": 5, "unfair": [5, 8], "enjoin": 5, "extern": [5, 6, 8], "januari": 5, "motion": 5, "oppos": [5, 8], "vacat": 5, "fourth": 5, "mine": 5, "nnot": 5, "aapl": 5, "nholder": 5, "na": [5, 8], "301": 5, "npurchas": 5, "nshare": 5, "nperiod": 5, "ttotal": 5, "taverag": 5, "npaid": 5, "nannounc": 5, "napproxim": 5, "That": [5, 6, 8, 9], "nunder": 5, "njune": 5, "august": [5, 6, 8], "nopen": 5, "negoti": [5, 8], "t35": 5, "697": 5, "t224": 5, "naugust": 5, "31": [5, 6, 7], "t42": 5, "910": 5, "t221": 5, "39": [5, 6, 7], "nseptemb": 5, "t33": 5, "653": 5, "t222": 5, "86": [5, 7], "ntotal": [5, 8], "t112": 5, "260": 5, "t89": 5, "074": 5, "110": 5, "10b5": 5, "reinvest": 5, "dow": 5, "supersector": 5, "27": [5, 7, 8], "2019": 5, "n2218": 5, "tseptemb": 5, "t100": 5, "t207": 5, "t273": 5, "t281": 5, "t322": 5, "t430": 5, "t113": 5, "t156": 5, "t131": 5, "t155": 5, "t210": 5, "ndow": 5, "t146": 5, "t216": 5, "t215": 5, "nfirst": 5, "nsecond": 5, "nthird": 5, "sequoia": 5, "nfourth": 5, "plu": [5, 7], "nfiscal": 5, "six": 5, "realign": 5, "span": [5, 7, 8], "indirectli": 5, "n2024": 5, "tchang": 5, "t2023": 5, "t2022": 5, "namerica": 5, "t167": 5, "045": 5, "t3": 5, "t162": 5, "560": 5, "t169": 5, "658": 5, "neurop": 5, "t101": 5, "328": 5, "t7": 5, "294": 5, "t95": 5, "118": 5, "ngreater": 5, "t66": 5, "952": 5, "t72": 5, "559": 5, "t74": 5, "njapan": 5, "t25": 5, "052": 5, "t24": 5, "257": 5, "977": 5, "nrest": 5, "t30": 5, "t4": 5, "t29": 5, "615": 5, "t1": 5, "t391": 5, "035": 5, "t2": 5, "t383": 5, "285": 5, "t394": 5, "weak": [5, 6, 8], "renminbi": 5, "yen": [5, 9], "t201": 5, "183": 5, "t200": 5, "583": 5, "t205": 5, "489": 5, "984": 5, "357": 5, "t40": 5, "177": [5, 8], "t26": 5, "694": 5, "t28": 5, "300": 5, "292": 5, "t37": 5, "005": 5, "t39": 5, "845": [5, 8], "t41": 5, "241": 5, "n96": 5, "169": 5, "t13": 5, "t85": 5, "t9": 5, "t78": 5, "129": [5, 8], "amort": 5, "bundl": 5, "flat": [5, 6], "ngross": 5, "t109": 5, "633": 5, "t108": 5, "803": 5, "t114": 5, "728": 5, "t71": 5, "t60": 5, "345": 5, "t56": 5, "054": 5, "t180": 5, "683": 5, "148": 5, "t170": 5, "782": 5, "t36": 5, "t73": 5, "t70": 5, "t46": 5, "t44": 5, "t43": 5, "noper": 5, "t31": 5, "370": 5, "t5": 5, "915": 5, "t14": 5, "251": 5, "npercentag": 5, "t8": 5, "nsell": 5, "administr": 5, "097": 5, "932": 5, "094": 5, "t6": 5, "t57": 5, "467": 5, "t54": 5, "847": 5, "t51": 5, "t15": 5, "headcount": 5, "nprovis": 5, "749": 5, "t16": 5, "741": 5, "t19": 5, "neffect": 5, "nstatutori": 5, "t21": 5, "aid": [5, 8], "nliquid": 5, "unrestrict": 5, "140": 5, "ndebt": 5, "97": [5, 6, 8], "payabl": 5, "promissori": 5, "nleas": 5, "nmanufactur": 5, "noncancel": 5, "ndeem": 5, "tcja": 5, "nstate": 5, "fund": [5, 6, 7], "escrow": 5, "ncapit": 5, "95": [5, 8], "nrecent": 5, "pronounc": 5, "nincom": 5, "fasb": 5, "asu": 5, "09": [5, 6, 8], "740": 5, "reconcili": [5, 6], "reconcil": [5, 9], "disaggreg": 5, "prospect": 5, "novemb": [5, 8], "07": [5, 6, 8, 9], "280": 5, "maker": 5, "codm": 5, "retrospect": 5, "ncritic": 5, "conform": [5, 9], "gaap": 5, "nuncertain": 5, "domest": 5, "taxat": 5, "resolut": [5, 6], "conting": 5, "ninterest": 5, "forth": 5, "hypothet": 5, "nsensit": 5, "nhypothet": 5, "nrate": 5, "npotenti": 5, "n100": 5, "tenor": 5, "ndeclin": 5, "755": 5, "089": 5, "nterm": 5, "nincreas": 5, "t139": 5, "t194": 5, "nforeign": 5, "var": 5, "mont": 5, "carlo": 5, "interv": 5, "538": 5, "669": 5, "nindex": 5, "tpage": 5, "nconsolid": 5, "n29": 5, "n30": 5, "sheet": 5, "n31": 5, "n32": 5, "n33": 5, "nnote": 5, "n34": 5, "nreport": 5, "n48": 5, "nall": 5, "omit": 5, "submiss": 5, "nyear": 5, "n2023": 5, "n2022": 5, "nnet": 5, "t294": 5, "866": 5, "t298": 5, "085": 5, "t316": 5, "199": 5, "t96": 5, "ncost": 5, "t185": 5, "233": 5, "t189": 5, "282": 5, "471": 5, "119": 5, "855": 5, "t22": 5, "075": 5, "352": 5, "t214": 5, "137": 5, "t223": 5, "546": 5, "t123": 5, "216": 5, "t119": 5, "437": 5, "t269": 5, "565": 5, "334": 5, "485": 5, "736": 5, "103": 5, "t93": 5, "995": 5, "t99": 5, "nearn": 5, "nbasic": 5, "ndilut": 5, "08": [5, 7, 9], "343": [5, 8], "783": 5, "744": 5, "215": 5, "963": 5, "095": 5, "812": 5, "547": 5, "325": 5, "819": 5, "nsee": 5, "translat": [5, 7, 8], "t395": 5, "765": 5, "511": 5, "unreal": 5, "832": 5, "t323": 5, "212": 5, "nadjust": 5, "337": 5, "717": 5, "394": 5, "138": 5, "850": 5, "563": 5, "104": 5, "t204": 5, "t253": 5, "816": 5, "899": 5, "272": 5, "t98": 5, "016": 5, "652": 5, "t88": 5, "531": 5, "nasset": 5, "ncurrent": 5, "ncash": 5, "943": 5, "965": 5, "228": 5, "590": 5, "naccount": 5, "410": 5, "508": 5, "nvendor": 5, "t32": 5, "833": 5, "477": 5, "ninventori": 5, "286": 5, "331": 5, "287": 5, "695": 5, "t152": 5, "987": 5, "t143": 5, "566": 5, "t91": 5, "479": 5, "544": 5, "t45": 5, "680": 5, "715": 5, "834": 5, "t64": 5, "758": 5, "t211": 5, "993": 5, "t209": 5, "017": 5, "t364": 5, "980": [5, 8], "t352": 5, "nliabil": 5, "t68": 5, "960": 5, "t62": 5, "611": 5, "304": 5, "t58": 5, "829": 5, "ndefer": 5, "249": 5, "061": 5, "ncommerci": 5, "967": 5, "985": 5, "t10": 5, "912": 5, "822": 5, "t176": 5, "392": 5, "t145": 5, "308": 5, "750": 5, "888": 5, "t49": 5, "848": 5, "638": 5, "t308": 5, "030": [5, 7], "t290": 5, "ncommit": 5, "nsharehold": 5, "400": [5, 6], "116": 5, "786": 5, "550": 5, "n83": 5, "276": 5, "naccumul": 5, "deficit": 5, "154": 5, "214": 5, "172": 5, "452": 5, "950": 5, "146": [5, 8], "t50": 5, "672": 5, "t63": 5, "090": 5, "nbegin": 5, "849": 5, "365": 5, "423": 5, "346": [5, 6], "175": 5, "withheld": 5, "settlement": 5, "521": 5, "971": 5, "t12": 5, "034": 5, "t11": 5, "nend": 5, "t83": 5, "nretain": 5, "068": 5, "562": 5, "ndividend": 5, "218": 5, "793": 5, "612": 5, "099": 5, "454": 5, "846": 5, "77": [5, 6, 7], "046": 5, "186": 5, "109": 5, "t163": 5, "rsu": 5, "t0": 5, "98": [5, 6, 7], "94": [5, 6, 7, 8], "737": 5, "929": 5, "ndepreci": 5, "445": 5, "519": 5, "688": 5, "038": 5, "266": 5, "227": 5, "006": 5, "788": 5, "356": 5, "271": 5, "520": 5, "618": 5, "484": 5, "731": 5, "684": 5, "499": 5, "020": 5, "889": 5, "448": 5, "552": 5, "031": 5, "t118": 5, "254": 5, "t110": 5, "543": 5, "t122": 5, "151": 5, "48": [5, 7], "656": 5, "513": 5, "76": [5, 8], "923": 5, "nproce": 5, "211": 5, "686": 5, "917": 5, "135": 5, "828": [5, 6], "446": 5, "447": 5, "959": 5, "708": 5, "086": 5, "935": 5, "705": 5, "354": 5, "nfinanc": 5, "441": 5, "431": 5, "223": [5, 8], "234": [5, 8], "025": 5, "841": 5, "nrepurchas": 5, "949": 5, "89": [5, 8], "402": 5, "465": 5, "nrepay": 5, "958": 5, "repay": 5, "978": 5, "955": 5, "361": 5, "581": 5, "160": 5, "121": 5, "983": 5, "488": 5, "794": 5, "760": 5, "nsupplement": 5, "102": 5, "t18": 5, "679": 5, "573": 5, "33": [5, 6, 7, 8], "nbasi": 5, "prior": [5, 8], "reclassifi": 5, "nrevenu": 5, "remit": [5, 8], "straight": 5, "vest": 5, "sold": 5, "nderiv": 5, "nonleas": 5, "34": [5, 6, 8], "entitl": 5, "commenc": 5, "deliveri": 5, "stand": 5, "ssp": 5, "icloud": 5, "siri": 5, "discount": 5, "undeliv": 5, "unbil": 5, "n26": 5, "n37": 5, "moder": [5, 7], "64": [5, 7, 8], "dilut": 5, "nnumer": 5, "ndenomin": 5, "nweight": 5, "312": 5, "316": 5, "856": 5, "antidilut": 5, "tunreal": 5, "ngain": 5, "tfair": 5, "nvalu": 5, "tcash": 5, "nequival": 5, "tcurrent": 5, "tnon": 5, "t27": 5, "nlevel": 5, "nmonei": 5, "t778": 5, "nmutual": 5, "n515": 5, "t105": 5, "t617": 5, "nsubtot": 5, "293": 5, "395": 5, "nu": 5, "treasuri": 5, "516": 5, "t212": 5, "087": 5, "380": 5, "159": 5, "t703": 5, "t17": 5, "568": 5, "158": 5, "810": 5, "ncertif": 5, "deposit": 5, "t873": 5, "t387": 5, "t478": 5, "066": 5, "ncorpor": 5, "t65": 5, "622": 5, "t270": 5, "953": 5, "939": 5, "027": 5, "t47": 5, "886": 5, "nmunicip": 5, "t412": 5, "t405": 5, "t190": 5, "nmortgag": 5, "595": 5, "t175": 5, "403": 5, "t23": 5, "367": 5, "278": [5, 8], "t132": 5, "t583": 5, "635": 5, "t128": 5, "056": 5, "966": 5, "t34": 5, "t160": 5, "t688": 5, "650": 5, "36": [5, 6, 7, 8], "359": [5, 8], "t481": 5, "n442": 5, "t428": 5, "t923": 5, "t909": 5, "406": 5, "114": 5, "468": 5, "136": 5, "t271": 5, "533": 5, "048": [5, 7], "491": 5, "332": 5, "t320": 5, "t608": 5, "t76": 5, "840": 5, "956": 5, "890": 5, "t20": 5, "627": 5, "243": 5, "t628": 5, "t602": 5, "t192": 5, "t410": 5, "735": 5, "636": 5, "t344": 5, "t144": 5, "470": 5, "657": 5, "831": 5, "125": 5, "162": 5, "t173": 5, "752": 5, "corrobor": 5, "mortgag": [5, 6], "classifi": [5, 8], "37": [5, 7, 8], "swap": 5, "remeasur": 5, "notion": 5, "069": 5, "730": 5, "575": 5, "493": 5, "t104": 5, "777": 5, "nhedg": 5, "433": 5, "505": 5, "247": [5, 8], "ntrade": 5, "41": [5, 7, 8], "44": [5, 8], "depreci": 5, "nland": 5, "690": 5, "nmachineri": 5, "t80": 5, "205": [5, 7], "314": 5, "nleasehold": 5, "839": 5, "599": 5, "73": [5, 7, 8], "884": 5, "852": 5, "t55": 5, "906": 5, "601": 5, "703": 5, "010": 5, "457": 5, "634": 5, "391": 5, "neuropean": 5, "opinion": [5, 6, 8], "1991": 5, "2007": 5, "irish": 5, "branch": 5, "2003": 5, "2014": 5, "2015": 5, "minist": 5, "juli": [5, 8], "annul": 5, "ecj": 5, "hear": 5, "asid": 5, "confirm": 5, "unrecogn": [5, 6], "nfeder": 5, "571": 5, "080": 5, "644": 5, "265": 5, "801": 5, "726": 5, "570": 5, "298": 5, "49": [5, 6, 8], "t84": 5, "428": 5, "603": 5, "483": [5, 8], "t347": 5, "t669": 5, "076": 5, "830": 5, "419": 5, "072": 5, "pretax": 5, "72": [5, 6, 8], "ncomput": 5, "885": 5, "012": 5, "124": 5, "518": 5, "nimpact": 5, "246": 5, "311": 5, "366": 5, "397": 5, "nexcess": 5, "893": 5, "871": 5, "192": [5, 8], "739": 5, "ntax": 5, "carryforward": 5, "302": 5, "naccru": 5, "413": [5, 8], "421": 5, "nunreal": 5, "173": 5, "168": 5, "873": 5, "743": 5, "nless": 5, "374": 5, "007": 5, "369": 5, "551": 5, "998": 5, "nright": 5, "179": 5, "nminimum": 5, "674": 5, "940": 5, "t511": 5, "t455": 5, "t490": 5, "805": 5, "202": 5, "indefinit": 5, "temporari": 5, "727": 5, "044": 5, "284": 5, "ndecreas": 5, "386": 5, "463": 5, "982": 5, "542": 5, "936": 5, "070": 5, "expir": 5, "statut": 5, "229": 5, "494": 5, "closur": 5, "intercompani": 5, "exceed": [5, 8], "multiyear": 5, "exercis": 5, "noncash": 5, "rou": 5, "tfinanci": 5, "t2024": 5, "tother": 5, "661": 5, "tproperti": 5, "015": 5, "303": 5, "676": 5, "t165": 5, "t752": 5, "t859": 5, "430": 5, "842": [5, 8], "tfinanc": 5, "n2025": 5, "820": 5, "t171": 5, "991": 5, "n2026": 5, "914": 5, "n2027": 5, "t59": 5, "733": 5, "n2028": 5, "360": 5, "t38": 5, "398": 5, "n2029": 5, "187": 5, "nthereaft": 5, "t837": 5, "undiscount": 5, "790": 5, "imput": 5, "376": 5, "534": 5, "t896": 5, "borrow": 5, "proce": 5, "nine": [5, 8], "nmatur": 5, "333": 5, "264": 5, "948": 5, "645": 5, "309": 5, "arrear": 5, "namount": 5, "n2013": 5, "nfix": 5, "2062": 5, "t97": 5, "341": 5, "03": [5, 6], "65": [5, 8], "t106": 5, "572": 5, "n97": 5, "nunamort": 5, "321": 5, "358": 5, "113": 5, "662": 5, "930": 5, "342": 5, "800": 5, "180": 5, "88": [5, 6], "ndure": 5, "425": 5, "426": 5, "372": 5, "589": 5, "055": 5, "appreci": 5, "four": [5, 7, 8], "holder": [5, 7], "n2014": 5, "bonu": 5, "nrestrict": 5, "nnumber": 5, "nrsu": 5, "ngrant": 5, "naggreg": 5, "nfair": 5, "nbalanc": 5, "t240": 5, "427": [5, 8], "t75": 5, "t150": 5, "861": 5, "501": 5, "768": 5, "87": [5, 6, 7, 8], "101": [5, 8], "878": 5, "144": 5, "t127": 5, "t135": 5, "91": [5, 8], "456": 5, "78": [5, 7, 8], "59": [5, 8], "t140": 5, "326": 5, "t158": 5, "204": 5, "350": 5, "002": [5, 7], "nuncondit": 5, "uncondit": 5, "206": 5, "440": 5, "156": 5, "t633": 5, "t670": 5, "226": 5, "45": 5, "nconting": 5, "accrual": 5, "nconcentr": 5, "attribut": [5, 6, 7, 8, 9], "46": 5, "t67": 5, "098": 5, "082": 5, "062": 5, "569": 5, "895": 5, "458": 5, "207": 5, "nonrecur": 5, "t142": 5, "196": 5, "t138": 5, "t147": 5, "859": 5, "nchina": 5, "n66": 5, "t181": 5, "887": 5, "t172": 5, "269": 5, "nlong": 5, "664": 5, "797": 5, "778": 5, "219": 5, "nopinion": 5, "nwe": 5, "fairli": 5, "pcaob": 5, "sponsor": 5, "treadwai": 5, "2013": 5, "unqualifi": [5, 6], "thereon": 5, "nthese": 5, "misstat": 5, "fraud": [5, 8], "ndescript": 5, "naudit": 5, "nhow": 5, "nmatter": 5, "qualifi": 5, "letter": [5, 6], "advisor": 5, "ernst": 5, "llp": 5, "auditor": [5, 6], "2009": 5, "nsan": 5, "jose": 5, "nnovemb": 5, "coso": 5, "nour": 5, "ndefinit": 5, "disposit": 5, "receipt": 5, "nevalu": 5, "nbase": 5, "supervis": [5, 7, 8, 9], "13a": 5, "15d": 5, "ninher": 5, "paragraph": 5, "51": [5, 8, 9], "ninsid": 5, "deirdr": 5, "brien": 5, "vice": 5, "presid": 5, "affirm": 5, "april": 5, "withhold": 5, "remitt": 5, "mr": 5, "copi": [5, 6], "solicit": 5, "00042": 5, "nincorpor": 5, "texhibit": 5, "descript": [5, 6, 7, 8, 9], "tform": 5, "tfile": 5, "nrestat": 5, "namend": 5, "bylaw": 5, "nindentur": 5, "york": [5, 6, 7, 9], "mellon": 5, "truste": 5, "noffic": 5, "certif": 5, "2018": 5, "85": [5, 7, 8], "05": [5, 6], "2044": 5, "februari": 5, "2045": 5, "900": 5, "700": [5, 7], "250": [5, 8], "2036": 5, "2046": 5, "450": 5, "2047": 5, "2049": 5, "2030": 5, "2050": 5, "2060": 5, "2028": 5, "2041": 5, "2061": 5, "2032": 5, "2052": 5, "54": [5, 6], "2033": 5, "2053": 5, "n12": 5, "nsubsidiari": 5, "n23": 5, "nconsent": 5, "n24": 5, "npower": 5, "signatur": 5, "nrule": 5, "nsection": 5, "1350": 5, "n101": 5, "ninlin": 5, "xbrl": 5, "n104": 5, "inlin": 5, "compensatori": 5, "herewith": 5, "furnish": 5, "herebi": 5, "undertak": 5, "56": [5, 7, 8], "nsignatur": 5, "npursuant": 5, "duli": 5, "undersign": 5, "thereunto": 5, "ndate": 5, "nby": 5, "luca": [5, 9], "maestri": 5, "nluca": 5, "nsenior": 5, "nchief": 5, "nknow": 5, "THESE": 5, "appoint": 5, "cook": 5, "jointli": 5, "her": 5, "substitut": 5, "him": 5, "thereto": 5, "therewith": 5, "ratifi": 5, "virtu": 5, "hereof": 5, "nname": 5, "ttitl": 5, "tdate": 5, "tchief": 5, "tnovemb": 5, "ntimothi": 5, "tsenior": 5, "kondo": 5, "nchri": 5, "wanda": 5, "austin": 5, "nwanda": 5, "gorski": 5, "tdirector": 5, "nalex": 5, "jung": 5, "nandrea": 5, "arthur": 5, "levinson": 5, "narthur": 5, "monica": 5, "lozano": 5, "nmonica": 5, "ronald": 5, "sugar": 5, "nronald": 5, "susan": 5, "wagner": 5, "nsusan": 5, "57": [5, 7], "turbo": [5, 7, 9], "outlin": [5, 7, 8], "invdestacksmeticsisdict": 5, "setispect": 5, "20cyan": 5, "evaluationseld": 5, "anvis": 5, "droitent": 5, "discernminerv": 5, "versbobprefvers": 5, "vo\u8be5": 5, "option\u548c": 5, "meio": 5, "\u0432\u0440\u0435\u043ccisco": 5, "dellaischenpoihscap": 5, "geme": 5, "gettim": 5, "unscal": 5, "vocabulari": [5, 7, 9], "closer": 5, "sharpen": 5, "uniform": 5, "raschka": 5, "repetit": [5, 9], "radic": 5, "grappl": 5, "safer": [5, 8], "fascin": 5, "spontan": 5, "answer": [5, 6, 7, 8, 9], "aren": [5, 7], "linear": 5, "absent": [5, 8], "coax": 5, "journei": 5, "suddenli": 5, "manifest": 5, "deliber": [5, 8], "contend": 5, "rethink": [5, 8], "tutor": 5, "children": [5, 8], "verifi": [5, 6, 7, 9], "predefin": [5, 9], "weren": 5, "kind": [5, 6], "usual": [5, 9], "quantif": 5, "contamin": [5, 8], "unseen": [5, 8], "longitudin": 5, "mostli": [5, 9], "latter": 5, "tailor": [5, 8], "great": [5, 7, 8, 9], "cognit": 5, "misinform": [5, 8], "fabric": [5, 8], "citat": 5, "tempor": [5, 6], "disclaim": 5, "referr": 5, "incorrect": [5, 8], "demograph": [5, 8], "stereotyp": [5, 8], "societ": [5, 8], "pii": [5, 8], "anonym": 5, "leakag": [5, 8], "carryov": 5, "fallaci": 5, "think": [5, 7, 8], "idiom": 5, "sarcasm": 5, "terminologi": 5, "lingual": 5, "misunderstand": 5, "syntax": 5, "scan": [5, 6], "compat": [5, 6, 7, 9], "overconfid": [5, 6], "clariti": [5, 6, 8, 9], "audienc": 5, "densiti": 5, "satisfact": [5, 9], "misus": [5, 8], "moral": 5, "co2": 5, "etc": [5, 6, 9], "palm": [5, 7], "easi": [5, 6, 7, 8], "synthet": [5, 7, 8, 9], "templat": [5, 6, 9], "timeout": 5, "inter": 5, "rater": 5, "ti": 5, "holist": [5, 8], "built": [5, 7, 8, 9], "experiment": [5, 7, 9], "vi": 5, "categor": [5, 7, 8, 9], "intrins": [5, 7], "extrins": 5, "perplex": [5, 7], "downstream": [5, 9], "synthesi": 5, "discret": 5, "prefix": [5, 8], "roug": 5, "bleu": 5, "bilingu": 5, "understudi": 5, "overlap": [5, 6], "favor": [5, 7, 9], "breviti": 5, "insensit": 5, "semant": [5, 6, 9], "orient": [5, 8], "gist": 5, "meteor": 5, "synonym": 5, "paraphras": 5, "alongsid": [5, 8], "computation": [5, 6], "cider": 5, "consensu": 5, "tf": 5, "idf": 5, "caption": 5, "reliant": [5, 6], "corpu": [5, 6, 7], "ter": 5, "edit": [5, 8], "hypothesi": 5, "penal": 5, "bertscor": 5, "contextu": [5, 8], "bert": 5, "spice": 5, "proposit": [5, 7], "scene": [5, 6, 8], "analyst": [5, 6], "rouge_1": 5, "rouge_2": 5, "ideal": [5, 7, 8, 9], "setup": [5, 7, 8, 9], "evaluate_summari": 5, "unigram": 5, "bigram": 5, "absl": 5, "py": [5, 9], "rouge_scor": 5, "generated_summari": 5, "reference_summari": 5, "google_bleu": 5, "bleu_scor": 5, "rouge1": 5, "rouge2": 5, "arbitrari": 5, "chosen": [5, 8], "sentence1": 5, "cat": [5, 8], "sat": 5, "mat": 5, "sentence2": 5, "ate": 5, "3333333333333333": 5, "7272727272727272": 5, "4444444444444445": 5, "generate_summari": 5, "summir": 5, "liner": 5, "evaluate_summary_model": 5, "model_benchmark": 5, "models_test": 5, "benchmark_summari": 5, "model_summari": 5, "evaluation_result": 5, "statu": 5, "concis": [5, 7], "element": [5, 8, 9], "verbos": [5, 7, 8, 9], "peripher": 5, "quit": [5, 6, 7, 9], "convei": 5, "breadth": 5, "Of": [5, 7, 8], "vibe": 5, "visualize_prompt_comparison": 5, "matplotlib": 5, "radar": 5, "plot": 5, "radar_plot": 5, "tmp": 5, "ipykernel_1652501": 5, "940173201": 5, "userwarn": [5, 9], "figurecanvasagg": 5, "largest": [5, 7], "sarmah": 5, "granular": [5, 6, 7], "likert": 5, "ensembl": 5, "repeatedli": [5, 6], "fluenci": 5, "refin": 5, "integ": [5, 9], "rubric": 5, "hollist": 5, "judgeevalu": 5, "grammar": [5, 7, 9], "evaluate_with_llm": 5, "criterion": 5, "judge_model": 5, "candidate_summari": 5, "grammat": 5, "y": [5, 6, 8, 9], "z": 5, "w": [5, 6, 7, 8], "benchmark_model": 5, "test_model": 5, "input_text": [5, 6, 7], "trillion": [5, 7, 9], "evals_list": 5, "1775618912": 5, "slightli": 5, "drift": [5, 8], "lowest": [5, 7], "firstli": 5, "overhead": [5, 7], "egocentr": 5, "tight": 5, "medicin": [5, 8], "glider": 5, "deshpand": 5, "3b": 5, "685": 5, "aplic": 5, "golden": 5, "earlier": [5, 8], "depict": [5, 8, 9], "multilingu": [5, 7, 8], "arena": 5, "randomli": 5, "customiz": [5, 7, 8], "irrelev": 5, "unhelp": [5, 8], "occasion": 5, "rare": 5, "perfectli": 5, "cater": [5, 7], "critiqu": [5, 8], "elo": 5, "exam": 5, "probe": [5, 8], "certifi": 5, "glue": 5, "entail": [5, 7], "superglu": 5, "successor": 5, "grew": 5, "big": [5, 7], "bench": [5, 7], "srivastava": 5, "truthfulqa": [5, 7], "multitask": 5, "hendryck": [5, 8], "multidisciplinari": 5, "stanford": 5, "helm": 5, "multidimension": 5, "surround": [5, 7, 8, 9], "humanev": [5, 7], "lmsy": 5, "brought": 5, "dialogu": [5, 7], "chiang": 5, "gather": 5, "hundr": [5, 7], "alpacaev": 5, "duboi": 5, "mt": 5, "argilla": 5, "mila": 5, "mit": [5, 7], "contributor": [5, 7, 9], "western": 5, "centric": 5, "divid": [5, 6, 8], "subset": [5, 8], "agnost": 5, "dialect": 5, "render": [5, 8], "crowdsourc": 5, "livebench": 5, "white": [5, 8], "resili": [5, 6, 8], "meaningfulli": 5, "satur": 5, "zebralog": 5, "grid": 5, "puzzl": 5, "brailsford": 5, "1999": 5, "lsat": 5, "hous": 5, "clue": 5, "deduct": 5, "programmat": [5, 9], "2x2": 5, "6x6": 5, "shot": [5, 8, 9], "reductio": 5, "ad": [5, 6, 7, 9], "absurdum": 5, "hard": [5, 6], "10b": 5, "counterfactu": 5, "mileston": [5, 7], "came": 5, "arc": 5, "prize": [5, 8], "chollet": 5, "mike": [5, 6, 8], "knoop": 5, "founder": 5, "zapier": 5, "fran\u00e7oi": 5, "creator": [5, 7], "kera": 5, "genuin": 5, "agi": 5, "possess": [5, 6], "elementari": 5, "novelti": 5, "interpol": 5, "synthes": 5, "fly": 5, "brute": 5, "pixel": 5, "color": [5, 6], "unbeaten": 5, "win": [5, 7], "takeawai": 5, "vertic": [5, 8], "finbench": 5, "legalbench": 5, "guha": 5, "berkelei": [5, 8], "bfcl": 5, "patil": 5, "fourrier": 5, "bespok": 5, "sdk": 5, "autoregress": 5, "sub": [5, 7], "liter": 5, "disturb": 5, "zero": [5, 7, 8, 9], "varianc": [5, 8], "yt": 5, "ut": 5, "ol": 5, "heteroscedast": 5, "regress": 5, "wish": 5, "bivari": 5, "evaluationtrack": 5, "pipelineparamet": 5, "cache_dir": 5, "max_sampl": 5, "basemodelconfig": 5, "evaluation_track": 5, "model_config": 5, "parallelismmanag": 5, "envconfig": 5, "is_accelerate_avail": 5, "datetim": [5, 6], "timedelta": [5, 6], "initprocessgroupkwarg": 5, "create_evaluation_pipelin": 5, "float16": 5, "kwargs_handl": 5, "3000": 5, "save_detail": 5, "pipeline_param": 5, "launcher_typ": 5, "env_config": 5, "override_batch_s": 5, "use_chat_templ": 5, "trust_remote_cod": 5, "pipeline_paramet": 5, "schemat": [5, 6], "vllm": [5, 9], "tgi": 5, "num_few_shot": 5, "bar": 5, "bigbench": 5, "winogrand": 5, "hellaswag": 5, "nlp": [5, 6, 7, 8], "save_and_push_result": 5, "show_result": 5, "model_arg": 5, "send": [5, 6, 7, 8, 9], "serverless": 5, "inference_server_address": 5, "inference_server_auth": 5, "model_id": 5, "null": 5, "bash": [5, 7], "command": [5, 6, 7], "model_config_path": 5, "endpoint_model": 5, "llama3": 5, "qwen2": [5, 7, 9], "alibaba": [5, 7, 9], "5b": [5, 7, 9], "hui": [5, 7], "allal": [5, 7], "cluster": 5, "noteworthi": [5, 7], "superior": [5, 8], "grain": [5, 7, 9], "salt": [5, 9], "modular": 5, "offici": 5, "revisit": 5, "langchain": [5, 6], "trace": 5, "langchain_tracing_v2": 5, "langchain_api_kei": 5, "hf_evalu": 5, "langsmith_evalu": 5, "ls_client": 5, "dataset_nam": 5, "create_dataset": 5, "create_exampl": 5, "dataset_id": 5, "calculate_scor": 5, "reference_output": 5, "oai_client": 5, "xp_model_nam": 5, "lastli": 5, "run_evalu": 5, "And": [5, 6, 7, 8], "upload_result": 5, "experiment_prefix": 5, "num_repetit": 5, "386a3620": 5, "9e1cc3cb": 5, "9d6a": 5, "4356": 5, "ab34": 5, "138e0abe8be4": 5, "8741976e": 5, "5268": 5, "4b75": 5, "949f": 5, "99477dde5d64": 5, "selectedsess": 5, "b831dc1e": 5, "90bc": 5, "4ed8": 5, "8080": [5, 7], "fb42444724d6": 5, "4it": 5, "latest": [5, 6, 7, 8, 9], "tobia": [5, 9], "evaluate_modul": 5, "6fc70b7be0088120a372dfdd5d320b39b8bb3630cb8029b193941d9376e86bb0": 5, "tue": 5, "nov": [5, 7], "couldn": 5, "5it": 5, "5053784e": 5, "64445871": 5, "a53c": 5, "44b1": 5, "a422": 5, "4f49b2f9656f": 5, "69": [5, 8], "4b29f3c9": 5, "9ef7e39a": 5, "2add": 5, "410c": 5, "89f8": 5, "9f1a8b198cf1": 5, "61": [5, 8], "insert": [5, 6], "combined_df": 5, "concat": [5, 8], "ignore_index": [5, 8], "execution_tim": 5, "example_id": 5, "333333": 5, "224388": 5, "feb10f92": 5, "3167": 5, "41f3": 5, "bb1c": 5, "d271153a31a8": 5, "5b196b22": 5, "9f4c": 5, "489c": 5, "b020": 5, "7823208b42d6": 5, "348101": 5, "722464": 5, "c310f159": 5, "064a": 5, "4035": 5, "97c3": 5, "a25bbf43abc2": 5, "386076": 5, "704104": 5, "f7f24899": 5, "dd50": 5, "409e": 5, "93cc": 5, "6fb1622b60bf": 5, "443038": 5, "725059": 5, "242856d6": 5, "efb5": 5, "4101": 5, "b1cf": 5, "5805532838ac": 5, "373418": 5, "795302": 5, "ce975169": 5, "a0ab": 5, "40ce": 5, "8e32": 5, "efa28d06079d": 5, "stat": [5, 7], "groupbi": [5, 8], "agg": [5, 8], "sort": 5, "sort_valu": 5, "subplot": 5, "pyplot": 5, "plt": 5, "ax1": 5, "ax2": 5, "figsiz": 5, "2ecc71": 5, "3498db": 5, "e74c3c": 5, "bleu_mean": 5, "bleu_std": 5, "enumer": [5, 6, 8], "errorbar": 5, "yerr": 5, "fmt": 5, "markers": 5, "capsiz": 5, "set_ylabel": 5, "set_titl": 5, "set_xtick": 5, "set_xticklabel": 5, "rotat": 5, "set_ylim": 5, "bottom": 5, "legend": 5, "exec_mean": 5, "exec_std": 5, "tight_layout": 5, "ndetail": 5, "4038": 5, "0453": 5, "7815": 5, "0433": 5, "3768": 5, "0424": 5, "8343": 5, "2208": 5, "3519": 5, "0775": 5, "9122": 5, "1482": 5, "377": 5, "042": 5, "078": 5, "slower": [5, 8], "04": [5, 7], "interestingli": 5, "decoupl": 5, "reload": 5, "facilit": [5, 8], "promptfooconfig": 5, "model_comparison": 5, "pretti": [5, 8], "dump": 5, "default_flow_styl": 5, "sort_kei": 5, "prompt1": 5, "defaulttest": 5, "ye": [5, 7, 8, 9], "1000m": 5, "eval_data": 5, "latency_m": 5, "totallatencym": 5, "token_usag": 5, "tokenusag": 5, "assert_pass": 5, "assertpasscount": 5, "assert_fail": 5, "assertfailcount": 5, "prompt_token": [5, 7], "num_request": 5, "numrequest": 5, "num": 5, "2463": 5, "000035": 5, "3773": 5, "004620": 5, "1669": 5, "000091": 5, "1669m": 5, "highest": [5, 7, 9], "3773m": 5, "00462": 5, "promptfool": 5, "manual": [5, 6, 7, 8], "redefin": 5, "prompt_comparison": 5, "prompt2": 5, "prompt3": 5, "prompt_fil": 5, "prompt_cont": 5, "BE": 5, "again": 5, "prompt_id": 5, "promptid": 5, "gradingresult": 5, "df_raw": 5, "reset_index": [5, 8], "poorli": 5, "eas": [5, 7, 8, 9], "hf": [5, 7], "plain": [5, 6, 7], "vanilla": 5, "defi": 5, "accustom": 5, "legaci": 5, "unsustain": 5, "prd": 5, "cultiv": [5, 8], "organiz": 5, "alb": [5, 7], "loubna": [5, 7], "anton": [5, 7], "lozhkov": [5, 7], "bakouch": [5, 7], "gabriel": [5, 7, 8], "mart\u00edn": [5, 7, 8], "bl\u00e1zquez": [5, 7], "lewi": [5, 6, 7], "tunstal": [5, 7], "agust\u00edn": [5, 7], "piquer": [5, 7], "andr": [5, 6, 7], "marafioti": [5, 7], "cyril": [5, 7], "zakka": [5, 7], "leandro": [5, 7], "werra": [5, 7], "wolf": [5, 7], "are24": 5, "judgearena": 5, "bps99": 5, "salli": 5, "pott": 5, "barbara": 5, "557": [5, 8], "sciencedirect": 5, "s0377221798003646": 5, "doi": [5, 6, 8, 9], "1016": 5, "s0377": 5, "2217": 5, "00364": 5, "ctj": 5, "jerri": [5, 8], "tworek": [5, 8], "heewoo": [5, 8], "jun": [5, 8], "qime": [5, 8], "henriqu": [5, 8], "pond": [5, 8], "de": [5, 8], "oliveira": [5, 8], "pinto": [5, 8], "harri": [5, 8], "yuri": 5, "burda": 5, "greg": [5, 8], "brockman": [5, 8], "raul": [5, 8], "puri": [5, 8], "gretchen": [5, 8], "krueger": [5, 8], "petrov": [5, 8], "heidi": 5, "khlaaf": 5, "girish": [5, 8], "sastri": [5, 8], "brook": [5, 8], "chan": [5, 8], "grai": [5, 8], "ryder": [5, 8], "mikhail": [5, 8], "pavlov": [5, 8], "alethea": [5, 8], "lukasz": 5, "kaiser": [5, 8], "mohammad": [5, 8], "bavarian": [5, 8], "clemen": [5, 8], "winter": [5, 8], "philipp": 5, "tillet": [5, 8], "felip": [5, 8], "petroski": [5, 8], "dave": [5, 8], "cum": [5, 8], "plappert": 5, "fotio": 5, "chantzi": [5, 8], "barn": 5, "ariel": 5, "herbert": 5, "voss": [5, 8], "hebgen": 5, "guss": 5, "nichol": 5, "paino": [5, 8], "nikola": [5, 8], "tezak": [5, 8], "babuschkin": [5, 8], "suchir": [5, 8], "balaji": [5, 8], "shantanu": [5, 8], "jain": [5, 8], "hess": [5, 8], "carr": 5, "josh": [5, 8], "achiam": [5, 8], "vedant": 5, "misra": 5, "evan": [5, 7, 8], "morikawa": [5, 8], "matthew": 5, "knight": [5, 8], "mile": [5, 8], "brundag": [5, 8], "mira": [5, 8], "murati": [5, 8], "kati": [5, 8], "mayer": [5, 8], "bob": [5, 8, 9], "mcgrew": [5, 8], "ilya": [5, 8], "sutskev": [5, 8], "wojciech": [5, 8], "zaremba": [5, 8], "2107": 5, "03374": 5, "cz": 5, "lianmin": 5, "ying": 5, "sheng": 5, "anastasio": 5, "angelopoulo": 5, "tianl": 5, "dacheng": 5, "banghua": 5, "jordan": [5, 8], "gonzalez": 5, "ion": 5, "stoica": 5, "04132": 5, "cho24a": 5, "francoi": 5, "arcpriz": 5, "cho24b": 5, "drcw": 5, "darshan": 5, "selvan": 5, "sunitha": 5, "ravi": 5, "sky": 5, "ch": 5, "bartosz": 5, "mielczarek": 5, "anand": [5, 8], "kannappan": [5, 8], "qian": [5, 8], "14140": 5, "dglh24": 5, "yann": 5, "bal\u00e1z": 5, "galambosi": 5, "tatsunori": 5, "hashimoto": 5, "debia": 5, "04475": 5, "fac24a": 5, "wiki": [5, 9], "fac24b": 5, "fac24c": 5, "model_doc": 5, "fac24d": 5, "cookbook": 5, "llm_judg": 5, "fac24f": 5, "fhwt23": 5, "cl\u00e9mentin": 5, "nathan": 5, "habib": 5, "gnh": 5, "julian": 5, "nyarko": 5, "ho": 5, "r\u00e9": 5, "adam": [5, 8], "chilton": 5, "aditya": [5, 8], "narayana": 5, "chohla": 5, "brandon": [5, 8, 9], "waldon": 5, "rockmor": 5, "diego": 5, "zambrano": 5, "dmitri": 5, "talisman": 5, "enam": 5, "hoqu": 5, "faiz": 5, "surani": 5, "frank": [5, 8], "fagan": 5, "galit": 5, "sarfati": 5, "gregori": 5, "dickinson": 5, "haggai": 5, "porat": 5, "hegland": 5, "jessica": [5, 8], "joe": [5, 8], "nudel": 5, "joel": [5, 8], "niklau": 5, "nai": 5, "choi": 5, "margaret": [5, 7], "hagan": 5, "megan": 5, "livermor": 5, "nikon": 5, "rasumov": 5, "rahe": 5, "nil": 5, "holzenberg": 5, "noam": 5, "kolt": 5, "henderson": 5, "rehaag": 5, "sharad": 5, "shang": 5, "spencer": 5, "sunni": 5, "gandhi": 5, "zur": 5, "varun": 5, "iyer": [5, 8], "zehua": 5, "2308": 5, "11462": 5, "hbb": 5, "collin": 5, "burn": 5, "steven": [5, 8], "basart": [5, 8], "zou": [5, 8], "manta": [5, 8], "mazeika": [5, 8], "03300": 5, "hbd": 5, "maxwel": 5, "forb": 5, "yejin": 5, "curiou": 5, "neural": [5, 9], "degener": 5, "1904": 5, "09751": 5, "hyc": [5, 7], "binyuan": [5, 7], "zeyu": [5, 7], "cui": [5, 7], "jiaxi": [5, 7], "dayiheng": [5, 7], "tianyu": [5, 7], "jiajun": [5, 7], "kai": [5, 7, 8], "dang": [5, 7], "coder": [5, 7], "preprint": [5, 7, 9], "2409": [5, 7, 8], "12186": [5, 7], "lx": 5, "zhen": 5, "xiaohan": 5, "jia": 5, "yuxuan": 5, "lai": 5, "chongyang": 5, "shuai": 5, "nlg": 5, "07103": 5, "lbl": 5, "bommasani": 5, "toni": 5, "dimitri": 5, "tsipra": 5, "dilara": 5, "soylu": 5, "michihiro": 5, "yasunaga": 5, "yian": 5, "deepak": 5, "narayanan": 5, "yuhuai": 5, "newman": 5, "binhang": 5, "bobbi": 5, "ce": 5, "christian": [5, 8], "cosgrov": 5, "acosta": 5, "nava": [5, 8], "drew": 5, "hudson": 5, "zelikman": 5, "esin": 5, "durmu": 5, "faisal": 5, "ladhak": 5, "frieda": 5, "rong": [5, 6], "ren": [5, 7], "huaxiu": 5, "yao": [5, 8, 9], "jue": 5, "keshav": 5, "santhanam": 5, "laurel": 5, "lucia": 5, "mert": 5, "yuksekgonul": 5, "mirac": 5, "suzgun": 5, "niladri": 5, "chatterji": 5, "omar": 5, "khattab": 5, "chi": [5, 8, 9], "sang": [5, 8], "shibani": [5, 8], "santurkar": [5, 8], "surya": 5, "icard": 5, "tianyi": 5, "vishrav": 5, "chaudhari": 5, "xuechen": 5, "yuhui": 5, "yuta": 5, "koreeda": 5, "2211": 5, "09110": 5, "lbc24": 5, "ronan": 5, "bra": 5, "allenai": 5, "lhe22": [5, 7, 8], "stephani": [5, 7, 8], "owain": [5, 7, 8], "mimic": [5, 7, 8], "falsehood": [5, 7, 8], "2109": [5, 7, 8], "07958": [5, 7, 8], "pzwg23": 5, "shishir": 5, "tianjun": 5, "xin": [5, 8], "gorilla": 5, "15334": 5, "pro24": 5, "dev": 5, "ras24": 5, "sebastian": [5, 6], "scratch": 5, "1633437166": 5, "sll": 5, "bhaskarjit": 5, "mingshu": 5, "jingrao": 5, "lyu": 5, "nathalia": 5, "castellano": 5, "pasquali": 5, "dhagash": 5, "12148": 5, "srf": 5, "shivalika": 5, "angelika": 5, "roman": [5, 8], "adelani": 5, "ngui": 5, "vila": 5, "suero": 5, "peerat": 5, "limkonchotiwat": 5, "kelli": 5, "marchisio": 5, "qi": 5, "leong": 5, "yosephin": 5, "susanto": 5, "raymond": [5, 8], "ng": [5, 8], "shayn": 5, "longpr": 5, "ko": 5, "madelin": 5, "antoin": 5, "bosselut": 5, "oh": 5, "leshem": 5, "choshen": 5, "daphn": 5, "ippolito": 5, "enzo": [5, 9], "ferrant": 5, "marzieh": 5, "fadae": 5, "beyza": 5, "ermi": 5, "sara": 5, "hooker": 5, "linguist": [5, 6, 8], "03304": 5, "srr": 5, "aarohi": 5, "abhinav": 5, "rastogi": 5, "abhishek": 5, "rao": 5, "abu": 5, "awal": 5, "shoeb": 5, "abubakar": 5, "abid": [5, 7], "fisch": 5, "santoro": 5, "gupta": 5, "adri\u00e0": 5, "garriga": 5, "alonso": 5, "agnieszka": 5, "kluska": 5, "aitor": 5, "lewkowycz": 5, "akshat": 5, "warstadt": 5, "alexand": [5, 8, 9], "kocurek": 5, "ali": [5, 8], "safaya": 5, "tazarv": 5, "aman": 5, "hussain": 5, "dsouza": 5, "ambros": 5, "slone": 5, "ameet": 5, "rahan": 5, "anantharaman": 5, "ander": 5, "andreassen": 5, "madotto": 5, "santilli": 5, "stuhlm\u00fcller": 5, "la": 5, "lampinen": 5, "angelica": 5, "anh": 5, "vuong": 5, "animesh": 5, "gottardi": 5, "antonio": 5, "norelli": 5, "anu": 5, "venkatesh": 5, "arash": 5, "gholamidavoodi": 5, "arfa": 5, "tabassum": 5, "arul": 5, "menez": 5, "arun": [5, 8], "kirubarajan": 5, "asher": 5, "mullokandov": 5, "ashish": 5, "sabharw": 5, "herrick": 5, "avia": 5, "efrat": 5, "aykut": 5, "erdem": 5, "ayla": 5, "karaka\u015f": 5, "bao": [5, 7, 8], "loe": 5, "barret": [5, 8], "zoph": [5, 8], "bart\u0142omiej": 5, "bojanowski": 5, "batuhan": 5, "\u00f6zyurt": 5, "behnam": 5, "hedayatnia": 5, "neyshabur": 5, "inden": 5, "benno": 5, "stein": 5, "berk": 5, "ekmekci": 5, "blake": 5, "howald": 5, "bryan": 5, "orinion": 5, "diao": 5, "dour": 5, "stinson": 5, "cedrick": 5, "argueta": 5, "c\u00e9sar": 5, "ferri": 5, "ram\u00edrez": 5, "chandan": 5, "charl": 5, "rathkopf": 5, "chenlin": 5, "meng": 5, "chitta": 5, "baral": 5, "chiyu": 5, "callison": 5, "burch": 5, "voigt": 5, "cindi": 5, "ramirez": 5, "clara": 5, "rivera": 5, "clemencia": 5, "siro": 5, "colin": [5, 7], "raffel": [5, 7], "courtnei": 5, "ashcraft": 5, "cristina": 5, "garbacea": 5, "damien": [5, 8], "sileo": 5, "garrett": 5, "kilman": 5, "freeman": 5, "khashabi": 5, "levi": [5, 8], "mosegu\u00ed": 5, "gonz\u00e1lez": 5, "perszyk": 5, "danqi": 5, "dar": 5, "gilboa": 5, "dohan": [5, 8], "drakard": 5, "jurgen": 5, "debajyoti": 5, "datta": 5, "deni": 5, "emelin": 5, "kleyko": 5, "deniz": 5, "yuret": 5, "derek": [5, 8], "tam": [5, 9], "dieuwk": 5, "hupk": 5, "diganta": 5, "dilyar": 5, "buzan": 5, "coelho": 5, "mollo": 5, "diyi": 5, "dylan": 5, "schrader": 5, "ekaterina": 5, "shutova": 5, "ekin": 5, "dogu": 5, "cubuk": 5, "elad": 5, "segal": 5, "eleanor": 5, "hagerman": 5, "donowai": 5, "elli": 5, "pavlick": 5, "rodola": 5, "emma": 5, "lam": 5, "chu": [5, 8], "erkut": 5, "erni": 5, "dyer": 5, "jerzak": 5, "eunic": 5, "engefu": 5, "manyasi": 5, "evgenii": 5, "zheltonozhskii": 5, "fanyu": 5, "fatemeh": 5, "siar": 5, "fernando": 5, "mart\u00ednez": 5, "plume": 5, "francesca": 5, "happ\u00e9": 5, "gaurav": 5, "genta": 5, "indra": 5, "winata": 5, "gerard": 5, "melo": 5, "germ\u00e1n": 5, "kruszewski": 5, "giambattista": [5, 8], "parascandolo": [5, 8], "giorgio": 5, "mariani": 5, "gloria": 5, "gonzalo": 5, "jaimovitch": 5, "l\u00f3pez": 5, "gregor": 5, "betz": 5, "gui": [5, 7], "gur": 5, "hana": 5, "galijasev": 5, "rashkin": 5, "hannaneh": 5, "hajishirzi": 5, "harsh": 5, "hayden": 5, "bogar": 5, "henri": [5, 8], "shevlin": 5, "hinrich": 5, "sch\u00fctze": 5, "hiromu": 5, "yakura": 5, "hongm": 5, "hugh": 5, "mee": 5, "wong": [5, 6, 8], "isaac": 5, "nobl": 5, "jaap": 5, "jumelet": 5, "geissing": 5, "jaehoon": 5, "jaim": 5, "fern\u00e1ndez": 5, "fisac": 5, "simon": 5, "koppel": 5, "koco\u0144": 5, "jana": 5, "thompson": [5, 7, 8], "janel": 5, "wingfield": 5, "jarema": 5, "radom": 5, "jascha": 5, "sohl": [5, 8], "dickstein": 5, "phang": 5, "yosinski": 5, "jekaterina": 5, "novikova": 5, "jell": 5, "bosscher": 5, "jennif": 5, "marsh": 5, "jeroen": 5, "taal": 5, "engel": 5, "jesujoba": 5, "alabi": 5, "jiam": 5, "jillian": 5, "joan": 5, "waweru": 5, "burden": 5, "bali": 5, "batcheld": 5, "berant": 5, "j\u00f6rg": 5, "frohberg": 5, "jo": 5, "rozen": 5, "orallo": 5, "boudeman": 5, "guerr": 5, "tenenbaum": 5, "joyc": 5, "chua": 5, "kanclerz": 5, "karen": 5, "livescu": 5, "karl": 5, "krauth": 5, "karthik": 5, "gopalakrishnan": 5, "katerina": 5, "ignatyeva": 5, "katja": 5, "markert": 5, "kaustubh": 5, "dhole": 5, "gimpel": 5, "omondi": 5, "kori": 5, "mathewson": 5, "kristen": 5, "chiafullo": 5, "ksenia": 5, "shkaruta": 5, "shridhar": 5, "kyle": [5, 6, 8], "mcdonel": 5, "richardson": 5, "laria": 5, "reynold": 5, "leo": [5, 8], "dugan": 5, "lianhui": 5, "lidia": 5, "contrera": 5, "ochando": 5, "morenc": 5, "moschella": 5, "luci": 5, "ludwig": 5, "schmidt": [5, 8], "luheng": 5, "olivero": 5, "col\u00f3n": 5, "metz": [5, 8], "l\u00fctfi": 5, "kerem": 5, "\u015fenel": 5, "maarten": [5, 8], "bosma": 5, "sap": [5, 8], "maartj": 5, "hoev": 5, "maheen": 5, "farooqi": 5, "manaal": 5, "faruqui": 5, "marco": 5, "baturan": 5, "marelli": 5, "maru": 5, "maria": 5, "quintana": 5, "tolkiehn": 5, "mario": [5, 8], "giulianelli": 5, "martha": 5, "potthast": 5, "leavitt": 5, "hagen": 5, "m\u00e1ty\u00e1": 5, "schubert": 5, "medina": [5, 8], "orduna": 5, "baitemirova": 5, "melodi": 5, "arnaud": 5, "melvin": 5, "mcelrath": 5, "yee": 5, "cohen": 5, "ivanitskii": 5, "starritt": 5, "strube": 5, "micha\u0142": 5, "sw\u0119drowski": 5, "michel": [5, 8], "bevilacqua": 5, "mihir": 5, "kale": 5, "cain": 5, "mime": 5, "mitch": 5, "walker": 5, "mo": 5, "tiwari": 5, "mohit": 5, "bansal": 5, "moin": 5, "aminnaseri": 5, "mor": 5, "geva": 5, "mozhdeh": 5, "gheini": 5, "mukund": 5, "varma": 5, "nanyun": 5, "peng": [5, 8], "nayeon": 5, "neta": 5, "krakov": 5, "doiron": 5, "nicol": 5, "martinez": 5, "nikita": 5, "nangia": 5, "nikla": 5, "decker": 5, "muennighoff": 5, "nitish": [5, 8], "shirish": [5, 8], "keskar": [5, 8], "niveditha": 5, "constant": 5, "fiedel": 5, "nuan": 5, "wen": [5, 6], "oliv": [5, 8], "agha": 5, "elbaghdadi": 5, "omer": 5, "moreno": 5, "casar": 5, "parth": 5, "doshi": 5, "pascal": 5, "fung": 5, "pu": 5, "vicol": 5, "pegah": 5, "alipoormolabashi": 5, "peiyuan": 5, "eckerslei": 5, "phu": 5, "mon": 5, "htut": 5, "pinyu": 5, "hwang": 5, "piotr": 5, "mi\u0142kowski": 5, "piyush": 5, "pouya": 5, "pezeshkpour": 5, "priti": 5, "oli": 5, "qiaozhu": 5, "mei": [5, 7], "qing": [5, 8], "qinlang": 5, "rabin": 5, "banjad": 5, "rachel": [5, 8], "etta": 5, "rudolph": 5, "raefer": 5, "rahel": 5, "haback": 5, "ramon": 5, "risco": 5, "rapha\u00ebl": 5, "milli\u00e8r": 5, "rhythm": 5, "garg": [5, 7], "rif": 5, "saurou": 5, "riku": 5, "arakawa": 5, "robb": 5, "raymaek": 5, "rohan": 5, "sikand": 5, "novak": 5, "sitelew": 5, "lebra": 5, "rosann": 5, "rowan": [5, 8], "ruslan": 5, "salakhutdinov": 5, "stoval": 5, "teehan": 5, "sahib": 5, "saif": 5, "sajant": 5, "dillav": 5, "shleifer": 5, "wiseman": 5, "gruetter": 5, "schoenholz": 5, "sanghyun": 5, "sanjeev": 5, "kwatra": 5, "sarik": 5, "ghazarian": 5, "sayan": 5, "casei": [5, 8], "bischoff": 5, "gehrmann": 5, "schuster": 5, "sepideh": 5, "sadeghi": 5, "shadi": 5, "hamdan": 5, "sharon": 5, "shashank": 5, "sherri": 5, "shi": [5, 8], "shikhar": 5, "shima": 5, "asaadi": 5, "shubh": 5, "pachchigar": 5, "shubham": 5, "toshniw": 5, "shyam": [5, 8], "upadhyai": 5, "shyamolima": 5, "debnath": 5, "siamak": 5, "shakeri": 5, "thormey": 5, "melzi": 5, "siva": 5, "reddi": 5, "sneha": 5, "priscilla": 5, "makini": 5, "soo": 5, "hwan": 5, "toren": 5, "sriharsha": 5, "hatwar": 5, "stanisla": 5, "dehaen": 5, "stefan": 5, "divic": 5, "stella": 5, "biderman": 5, "stephen": 5, "prasad": 5, "piantadosi": 5, "stuart": [5, 8], "shieber": 5, "summer": [5, 8], "misherghi": 5, "svetlana": 5, "kiritchenko": 5, "swaroop": 5, "tal": 5, "linzen": 5, "tariq": 5, "tatsu": 5, "te": 5, "th\u00e9o": 5, "desbord": 5, "theodor": 5, "rothschild": 5, "phan": [5, 8], "tiberiu": 5, "nkinyili": 5, "timo": 5, "schick": 5, "timofei": 5, "kornev": 5, "titu": 5, "tunduni": 5, "gerstenberg": 5, "trenton": 5, "trishala": 5, "neeraj": 5, "tushar": 5, "khot": 5, "shultz": 5, "uri": 5, "shaham": 5, "vera": 5, "demberg": 5, "victoria": [5, 8], "nyamai": 5, "vika": 5, "raunak": 5, "vinai": 5, "ramasesh": 5, "udai": 5, "prabhu": 5, "vishakh": 5, "padmakumar": 5, "vivek": [5, 6], "srikumar": [5, 6], "fedu": [5, 8], "wout": 5, "vossen": 5, "xiaoyu": 5, "tong": [5, 8], "xinran": 5, "xinyi": 5, "yadollah": 5, "yaghoobzadeh": 5, "yair": 5, "lakretz": 5, "yangqiu": 5, "yasaman": 5, "bahri": 5, "yichi": 5, "yide": 5, "yifu": 5, "yonatan": 5, "belinkov": 5, "yufang": 5, "seid": 5, "zhuoy": 5, "zijian": 5, "ziji": 5, "zirui": 5, "ziyi": 5, "extrapol": 5, "2206": 5, "04615": 5, "wpn": 5, "yada": 5, "pruksachatkun": 5, "amanpreet": 5, "hill": 5, "stickier": 5, "wsm": 5, "1804": 5, "07461": 5, "wtb": 5, "tai": 5, "borgeaud": 5, "dani": 5, "yogatama": 5, "denni": [5, 8], "donald": 5, "metzler": 5, "ed": 5, "oriol": 5, "vinyal": 5, "dean": 5, "07682": 5, "wdr": 5, "doolei": 5, "manlei": 5, "arka": [5, 8], "pal": 5, "feuer": 5, "siddhartha": 5, "ravid": 5, "shwartz": [5, 8], "ziv": 5, "khalid": [5, 7], "saifullah": 5, "siddartha": 5, "naidu": 5, "chinmai": 5, "hegd": 5, "lecun": 5, "goldstein": 5, "willi": 5, "neiswang": 5, "micah": 5, "goldblum": 5, "19314": 5, "yyh": 5, "baosong": [5, 7], "chengpeng": 5, "chengyuan": [5, 7], "fei": [5, 7], "guant": 5, "haoran": [5, 7], "huan": [5, 7], "jialong": 5, "jialin": 5, "jianhong": [5, 7], "tu": [5, 7], "jianwei": [5, 7], "jianxin": [5, 7], "jin": [5, 6, 8], "jingren": [5, 7], "jinz": 5, "jinzheng": 5, "junyang": [5, 7], "keme": [5, 7], "keqin": [5, 7], "kexin": [5, 7], "mingfeng": [5, 7], "xue": [5, 7, 8], "ni": [5, 6], "pei": [5, 7, 8], "ru": 5, "men": [5, 7], "ruiz": 5, "runji": [5, 7], "shiji": 5, "sinan": 5, "tianhang": 5, "wenbin": 5, "ge": 5, "xiaodong": 5, "deng": 5, "xiaohuan": 5, "xingzhang": [5, 7], "xinyu": [5, 8], "xipin": 5, "xuancheng": [5, 7], "yichang": [5, 7], "wan": [5, 7], "yunfei": 5, "yuqiong": [5, 7], "zhenru": [5, 7], "zhihao": 5, "10671": 5, "zcl24": 5, "zhihan": 5, "cao": 5, "lizi": 5, "openreview": [5, 6], "forum": [5, 6], "aegrf1uy0p": 5, "zc": 5, "siyuan": 5, "zhuang": [5, 8], "zhanghao": 5, "yonghao": 5, "zi": 5, "zhuohan": 5, "xing": [5, 8], "2306": [5, 8], "05685": 5, "huggingface24": 5, "metaai24": 5, "422": 5, "thank": [5, 7, 9], "doubl": 6, "steve": [6, 8], "lclm": 6, "simultan": [6, 7, 8], "cutoff": 6, "amayuela": 6, "tail": 6, "kotha": 6, "unifi": [6, 7, 9], "codebas": [6, 7], "ingest": 6, "preprocess": [6, 7, 9], "parser": [6, 9], "microsoft": [6, 7], "autogen": 6, "powerpoint": 6, "ocr": 6, "exif": 6, "metadata": [6, 7], "docker": [6, 7], "container": [6, 7], "xlsx": 6, "text_cont": 6, "ibm": [6, 7, 8], "docx": 6, "pptx": 6, "layout": 6, "llamaindex": 6, "document_convert": 6, "documentconvert": 6, "export_to_markdown": 6, "presenc": 6, "merril": 6, "lynch": 6, "cio": 6, "outlook": 6, "forecast_file_path": 6, "result_md": 6, "forecast_result_docl": 6, "levenshtein": 6, "distanc": 6, "sequencematch": 6, "difflib": 6, "longest": 6, "levenshtein_similar": 6, "text1": 6, "text2": 6, "max_len": 6, "simple_similar": 6, "ratio": [6, 7], "forecast_result_md": 6, "13985705461925346": 6, "17779960707269155": 6, "readabl": 6, "messi": 6, "2025e": 6, "compos": [6, 7, 8], "financial_vari": 6, "financial_forecast": 6, "econforecast": 6, "extract_prompt": 6, "base_prompt": [6, 9], "extract_from_doc": 6, "twice": 6, "md_financi": 6, "docling_financi": 6, "easier": [6, 7, 8, 9], "gdp": 6, "cpi": 6, "fed": 6, "df_md_forecast": 6, "df_docling_forecast": 6, "despit": [6, 7, 9], "underweight": 6, "neutral": [6, 8], "overweight": 6, "chart": 6, "asset_class_docl": 6, "asset_class_md": 6, "df_md": 6, "df_docl": 6, "true_valu": 6, "df_comparison": 6, "cap": 6, "exempt": 6, "markitdown_accuraci": 6, "docling_accuraci": 6, "93": [6, 7, 8], "unstructur": [6, 7, 9], "sector": 6, "convert_and_export_t": 6, "file_path": 6, "doc_convert": 6, "start_tim": [6, 8], "conv_r": 6, "table_df": 6, "export_to_datafram": 6, "end_tim": 6, "2f": 6, "usd": 6, "wtd": 6, "mtd": 6, "ytd": 6, "djia": 6, "926": 6, "amp": 6, "051": 6, "277": 6, "russel": [6, 8], "2000": 6, "msci": 6, "817": [6, 8], "eaf": 6, "319": 6, "107": 6, "01": [6, 7], "66": [6, 8], "92": 6, "municip": 6, "79": [6, 8], "slight": 6, "discretionari": 6, "yellow": 6, "estat": 6, "orang": 6, "stapl": 6, "constructor": 6, "md_llm": 6, "llm_client": 6, "llm_model": 6, "png": 6, "overview": [6, 9], "showcas": 6, "bond": 6, "crude": 6, "oil": 6, "sit": 6, "648": 6, "ounc": 6, "euro": 6, "tactic": 6, "bofa": 6, "circl": [6, 8], "fetch": 6, "reassembl": 6, "max_output_token": 6, "statement": [6, 8], "10k": 6, "diagram": [6, 8], "charactertextsplitt": 6, "tiktoken": [6, 8], "sequenti": 6, "newlin": 6, "cheap": 6, "speciali": 6, "nltk": 6, "spaci": 6, "hierarch": [6, 8], "talk": 6, "theme": [6, 7, 8], "splitter": 6, "surpass": 6, "get_chunk": 6, "chunk_siz": 6, "chunk_overlap": 6, "langchain_text_splitt": 6, "text_splitt": 6, "from_tiktoken_encod": 6, "split_text": 6, "persona": 6, "langchain_cor": [6, 9], "prompttempl": 6, "get_base_prompt_templ": 6, "from_templ": 6, "llmchain": 6, "output_pars": 6, "stroutputpars": 6, "langchain_commun": 6, "chat_model": 6, "chatlitellm": 6, "get_llm_chain": 6, "prompt_templ": [6, 9], "llm_chain": [6, 9], "api_key_label": 6, "upper": 6, "_api_kei": 6, "get_dynamic_prompt_templ": 6, "get_dynamic_prompt_param": 6, "prompt_param": 6, "part_idx": 6, "total_part": 6, "chat_context": 6, "param": 6, "dynamic_prompt_param": 6, "concaten": 6, "generate_report": 6, "input_cont": 6, "llm_model_nam": 6, "report_part": 6, "num_part": 6, "dinam": 6, "priovid": 6, "invok": [6, 9], "cummul": 6, "max_chunk_s": 6, "max_chunk_overlap": 6, "gemini": [6, 7], "apple_report": 6, "report_cont": 6, "report_lin": 6, "splitlin": 6, "total_lin": 6, "quarter_lin": 6, "top_port": 6, "bottom_port": 6, "uncov": [6, 8, 9], "delv": 6, "consol": 6, "fewer": [6, 7, 8], "reaction": 6, "breakdown": [6, 8], "disciplin": 6, "appar": [6, 8], "subhead": 6, "depth": [6, 8], "2m": [6, 7], "harvard": [6, 7], "enrol": 6, "gov": [6, 8], "1039": 6, "birth": [6, 8], "democraci": 6, "tuesdai": 6, "magna": 6, "carta": 6, "trudg": 6, "dens": 6, "conversation": 6, "knowledge_bas": 6, "add_knowledge_bas": 6, "add_cit": 6, "bool": [6, 8], "num_quest": 6, "input_memori": 6, "response_memori": 6, "urls_memori": 6, "extractor": 6, "cic": 6, "citabl": 6, "passag": [6, 8], "corpora": 6, "formatted_cont": 6, "reference_id": 6, "wrapper": [6, 9], "content_gener": 6, "user_instruct": 6, "llmbackend": 6, "cache_ttl": 6, "cachedcont": 6, "display_nam": 6, "due_knowledge_bas": 6, "system_instruct": 6, "compose_prompt": 6, "conversation_config": 6, "ttl": 6, "generativemodel": 6, "from_cached_cont": 6, "cached_cont": 6, "behind": [6, 8], "quiz_inst": 6, "professor": 6, "difficulti": [6, 8], "dataset": [6, 9], "syllabu": 6, "kennedi": 6, "inaugur": 6, "lincoln": 6, "gettysburg": 6, "liberti": 6, "mayflow": 6, "abraham": 6, "gutenberg": 6, "kb": 6, "epub": 6, "pg": 6, "gemini_duo": 6, "genai_duo": 6, "duo": 6, "usage_metadata": 6, "38470": 6, "anytim": 6, "shap": 6, "mckechni": 6, "study_refer": 6, "pg10000": 6, "65363": 6, "pg65363": 6, "quizz": 6, "problemat": [6, 8], "misinterpret": 6, "awp": 6, "alfonso": 6, "liangm": 6, "pan": [6, 8], "wenhu": 6, "lun": 6, "ku": 6, "editor": [6, 8], "acl": [6, 8], "6416": 6, "6432": 6, "bangkok": 6, "thailand": 6, "aclanthologi": [6, 8], "383": 6, "18653": [6, 8], "v1": [6, 7, 8], "ksr24": 6, "suha": 6, "springer": 6, "aditi": 6, "raghunathan": 6, "twelfth": 6, "vrhif2hsrm": 6, "lcd": 6, "jinhyuk": 6, "zhuyun": 6, "dheeru": 6, "dua": 6, "devendra": 6, "sachan": 6, "boratko": 6, "luan": 6, "s\u00e9bastien": 6, "arnold": 6, "vincent": 6, "perot": 6, "siddharth": 6, "dalmia": 6, "hexiang": 6, "panupong": 6, "pasupat": 6, "aida": 6, "amini": 6, "cole": 6, "riedel": 6, "iftekhar": 6, "naim": 6, "ming": [6, 8], "guu": 6, "subsum": 6, "sql": 6, "13121": 6, "lpp": 6, "aleksandra": 6, "piktu": 6, "fabio": [6, 8], "petroni": 6, "vladimir": 6, "karpukhin": 6, "heinrich": 6, "k\u00fcttler": 6, "tau": 6, "yih": 6, "rockt\u00e4schel": 6, "douw": 6, "kiela": 6, "2005": 6, "11401": 6, "nbgc24": 6, "shiyu": 6, "kepe": 6, "bi": 6, "jiafeng": 6, "guo": [6, 8], "xueqi": 6, "cheng": [6, 8, 9], "11375": 6, "11388": 6, "675": 6, "tdw": 6, "jiejun": 6, "zhicheng": 6, "dou": 6, "mang": 6, "weipeng": 6, "ji": 6, "htmlrag": 6, "02959": 6, "zlj": 6, "jiaji": 6, "yun": [6, 9], "metacognit": 6, "1453": 6, "1463": 6, "ny": [6, 8, 9], "usa": [6, 8, 9], "machineri": [6, 9], "1145": [6, 8, 9], "3589334": 6, "3645481": 6, "anthropic4a": 6, "langchain24": 6, "how_to": 6, "merrilllynch24": 6, "weekli": 6, "olui2": 6, "gwmol": 6, "di": 7, "hunter": 7, "photo": 7, "email": 7, "hipaa": 7, "properti": [7, 8], "gdpr": 7, "strict": [7, 8, 9], "iot": 7, "unreli": 7, "impract": 7, "slm": 7, "viabl": 7, "sensor": 7, "interconnect": 7, "frontend": 7, "garner": 7, "yourself": 7, "aw": [7, 8], "bedrock": 7, "sambanova": 7, "sla": 7, "veloc": 7, "roadmap": 7, "commodit": 7, "winner": 7, "loser": 7, "condens": 7, "clean": 7, "2024t": 7, "versatil": 7, "72b": 7, "med": 7, "bloomberggpt": 7, "underw": 7, "adept": 7, "toxigen": 7, "alnajjar": 7, "13b": [7, 8], "outperform": 7, "32b": 7, "feasibl": 7, "modal": 7, "diagnosi": 7, "patient": 7, "necessit": 7, "deepseek": 7, "flagship": 7, "405b": 7, "pack": 7, "v3": [7, 8], "671": 7, "moe": 7, "mixtur": 7, "3x": [7, 8], "v2": [7, 8], "fraction": 7, "domin": 7, "cautiou": 7, "cautious": 7, "isol": [7, 8], "cpot": 7, "cpit": 7, "tco": 7, "tpot": 7, "ttft": 7, "sent": [7, 8], "gpqa": 7, "median": 7, "afford": 7, "meanwhil": 7, "lite": 7, "micro": 7, "encod": [7, 8, 9], "cent": 7, "1m": 7, "cheapest": 7, "phi": 7, "half": [7, 8], "permiss": [7, 8], "apach": 7, "simpler": [7, 9], "700m": 7, "100m": 7, "gemma": [7, 9], "grown": 7, "withdraw": 7, "incomplet": [7, 8], "unclear": 7, "15t": 7, "8t": 7, "fineweb": 7, "penedo": 7, "96": [7, 8], "crawl": 7, "snapshot": 7, "ablat": 7, "vital": [7, 8], "favorit": 7, "spawn": 7, "ultrachat": 7, "2024u": 7, "created_job": 7, "fine_tun": 7, "training_fil": 7, "file_id": 7, "ultrachat_chunk_train": 7, "validation_fil": 7, "ultrachat_chunk_ev": 7, "training_step": 7, "0001": 7, "auto_start": 7, "job_id": 7, "toolkit": [7, 8], "sft": 7, "nemo": [7, 8], "codestr": 7, "2024v": 7, "enough": 7, "rewrit": 7, "smolvlm": 7, "mlx": [7, 9], "mlc": 7, "peft": 7, "programm": 7, "graphic": [7, 8], "vram": 7, "mathbf": 7, "x_1": [7, 9], "x_2": [7, 9], "x_n": [7, 9], "x_": [7, 9], "\u03b8": 7, "matrix": [7, 8], "cerebra": 7, "mozilla": 7, "gerganov": 7, "georgi": 7, "overwhelm": [7, 9], "manifesto": 7, "enjoy": 7, "bog": 7, "exploratori": 7, "hacker": 7, "Will": [7, 8], "prototyp": 7, "prematur": 7, "besid": 7, "lighter": 7, "sacrific": 7, "ggml": [7, 9], "disk": 7, "backward": 7, "2024x": 7, "repo": 7, "compil": 7, "linux": 7, "argument": [7, 8, 9], "sudo": 7, "apt": 7, "cmake": 7, "bind": 7, "betlen": 7, "cnv": 7, "llamacpp": 7, "succinct": 7, "ctrl": 7, "interject": 7, "philosoph": 7, "debat": 7, "fulfil": 7, "happi": 7, "responsibli": 7, "bye": 7, "goodby": 7, "port": 7, "127": 7, "curl": [7, 9], "localhost": 7, "bearer": 7, "finish_reason": 7, "deepli": 7, "1734627879": 7, "completion_token": 7, "total_token": 7, "chatcmpl": 7, "5wl2tzjzdmzupvxwp2gcedr8xbpsyhfm": 7, "prompt_n": 7, "prompt_m": 7, "132": 7, "prompt_per_token_m": 7, "prompt_per_second": 7, "77619878666999": 7, "predicted_n": 7, "predicted_m": 7, "1700": 7, "654": [7, 9], "predicted_per_token_m": 7, "36882142857143": 7, "predicted_per_second": 7, "92850867960208": 7, "gbnf": [7, 9], "8pm": 7, "appointmenttim": 7, "appointmentdetail": 7, "handi": 7, "model_path": 7, "llama_cpp": 7, "create_chat_complet": 7, "occupi": 7, "activist": 7, "justin": [7, 8], "tunnei": 7, "ocho": 7, "appach": 7, "cosmopolitan": 7, "libc": 7, "portabl": 7, "durabl": 7, "usabl": [7, 8, 9], "tinyllama": 7, "wget": 7, "jartin": 7, "q5_k_m": 7, "renam": 7, "ex": 7, "chmod": 7, "nobrows": 7, "registri": 7, "nativ": [7, 9], "trai": 7, "familiar": 7, "bare": 7, "ssfl": 7, "sh": [7, 9], "Or": 7, "11434": 7, "chatrespons": 7, "easiest": 7, "rich": [7, 8], "playground": 7, "importantli": [7, 9], "intuit": 7, "beginn": 7, "tensorrt": 7, "trt": 7, "latex": 7, "voic": 7, "pwa": 7, "medium": [7, 8], "gpt4all": 7, "rbac": 7, "q4_k": 7, "q6_k": 7, "mib": 7, "wikitext": 7, "salesforc": 7, "wikipedia": [7, 9], "min_prompt_length": 7, "input_texts_raw": 7, "began": 7, "2010": 7, "valkyria": 7, "chronicl": 7, "forgiv": 7, "newcom": 7, "raita": 7, "honjou": 7, "hitoshi": 7, "sakimoto": 7, "takeshi": 7, "ozawa": 7, "writer": 7, "sung": 7, "escap": 7, "escaped_text": 7, "block_scal": 7, "block": [7, 8], "parenthes": 7, "block_min": 7, "formula": 7, "superblock": 7, "5625": 7, "ieee": 7, "754": 7, "ppl": 7, "exp": 7, "sum_": 7, "log_2": 7, "x_i": [7, 9], "avg": 7, "_i": 7, "corr": 7, "ln": [7, 9], "kullback": 7, "leibler": 7, "entropi": 7, "logit": 7, "d_": 7, "softmax": [7, 9], "sum": 7, "kld": 7, "q2_kresult": 7, "q6": 7, "004": 7, "q2": 7, "112": 7, "q4": 7, "smallest": 7, "390": 7, "67": [7, 8], "81": [7, 8], "462": 7, "614": 7, "170": 7, "q4_k_m": 7, "thread": 7, "16x": 7, "85x": 7, "79x": 7, "ubuntu": 7, "lt": 7, "x86_64": 7, "gnu": 7, "intel": 7, "i7": 7, "8550u": 7, "15gib": 7, "samsung": 7, "ssd": 7, "970": 7, "evo": 7, "500gb": 7, "1170": 7, "meant": 7, "ai4c": 7, "ai4a": 7, "paperswithcod": [7, 8], "ana24a": 7, "leaderboard": [7, 8], "artificialanalysi": 7, "ana24b": 7, "ana24c": 7, "bc24": 7, "andrei": [7, 8], "abetlen": 7, "dee24": 7, "blob": [7, 9], "deepseek_v3": 7, "fac4": 7, "optimum": 7, "concept_guid": 7, "fac4t": 7, "fac4u": 7, "200k": 7, "ultrachat_200k": 7, "fac4v": 7, "blogpost": 7, "gc24": 7, "ggerganov": [7, 9], "readm": [7, 9], "gc4a": 7, "gc4b": 7, "pka": 7, "guilherm": 7, "hynek": 7, "kydl\u00ed\u010dek": 7, "decant": 7, "finest": 7, "17557": 7, "qwe4b": 7, "qy": 7, "beichen": 7, "tingyu": 7, "su": 7, "zihan": 7, "qiu": 7, "15115": 7, "rev24": 7, "nyt": 7, "harvardlawreview": 7, "timess": 7, "zwa": 7, "wael": 7, "geoffrei": [7, 8], "angu": 7, "arnav": 7, "jefferi": 7, "kinnison": 7, "sherstinski": 7, "piero": 7, "molino": 7, "travi": 7, "addair": 7, "devvret": 7, "310": 7, "2405": 7, "00732": 7, "huggingface4xa": 7, "huggingface4xb": 7, "ibmthink24": 7, "lmstudio24": 7, "lmstudio": 7, "metaai4c": 7, "mozillaocho24": 7, "salesforce24": 7, "immens": 8, "commonplac": 8, "spur": 8, "hartvigsen": 8, "societi": 8, "alarm": 8, "openli": 8, "dolli": 8, "llama2": [8, 9], "emb": 8, "generalist": 8, "injustic": 8, "inequ": 8, "undermin": 8, "perpetu": 8, "displac": 8, "eros": 8, "fake": 8, "deepfak": 8, "distrust": 8, "cyberattack": 8, "spread": 8, "disinform": 8, "inadvert": 8, "interven": 8, "irrevers": 8, "uncheck": 8, "extinct": 8, "race": 8, "incentiv": 8, "shortcut": 8, "stress": 8, "urgent": 8, "reorient": 8, "siam": 8, "edgington": 8, "jailbreak": 8, "promptcraft": 8, "stealth": 8, "sutton": 8, "subtl": 8, "subtleti": 8, "exception": 8, "phrase": 8, "evad": 8, "hqve": 8, "frer": 8, "hplidai": 8, "pl": 8, "hyperion": 8, "coast": 8, "redwood": 8, "tallest": 8, "tree": [8, 9], "routin": 8, "prejudic": 8, "gallego": 8, "leak": 8, "poison": 8, "intention": 8, "inject": 8, "mislead": 8, "exabeam": 8, "finra": 8, "3110": 8, "mandat": 8, "supervisori": 8, "unicef": 8, "empow": 8, "contest": 8, "congress": 8, "enact": 8, "pictur": [8, 9], "sound": 8, "territori": 8, "oversea": 8, "chines": 8, "legitim": 8, "consent": 8, "complaint": 8, "cooper": 8, "extraterritori": 8, "offshor": 8, "draft": 8, "voluntari": 8, "player": 8, "prepared": 8, "compris": 8, "cbrn": 8, "persuas": 8, "autonomi": 8, "gradat": 8, "scorecard": 8, "elig": 8, "advisori": 8, "sag": 8, "shut": 8, "prerequisit": 8, "harden": 8, "asl": 8, "biosafeti": 8, "elev": 8, "warn": [8, 9], "bioweapon": 8, "compartment": 8, "4x": 8, "jump": 8, "paus": 8, "deepmind": 8, "biosecur": 8, "buffer": 8, "formul": [8, 9], "calibr": 8, "promin": 8, "taxonomi": 8, "llamaguard": 8, "20241022": 8, "5x": 8, "alaga": 8, "substandard": 8, "oxford": 8, "wachter": 8, "blur": 8, "ill": 8, "stifl": 8, "suscept": 8, "aadc": 8, "outset": 8, "curricula": 8, "adversari": 8, "thoroughli": 8, "lm": [8, 9], "undergo": 8, "280b": 8, "cai": [8, 9], "enshrin": 8, "evas": 8, "resort": 8, "avenu": 8, "cambria": 8, "inherit": 8, "influenti": 8, "debias": 8, "plausibl": 8, "occurr": 8, "phish": 8, "clarifi": 8, "toler": 8, "checklist": 8, "abus": 8, "ux": 8, "architect": 8, "retrofit": 8, "promptli": 8, "dashboard": 8, "misalign": 8, "star": 8, "postpon": 8, "combat": 8, "counter": 8, "traffic": 8, "frustrat": 8, "workaround": 8, "silo": 8, "hierarchi": 8, "mcq": 8, "regex": [8, 9], "joint": 8, "facet": 8, "purpl": 8, "opensafetylab": 8, "salad_bench_dataset": 8, "base_set": 8, "gptfuzzer": 8, "auto": [8, 9], "qid": 8, "o1": 8, "supremaci": 8, "o53": 8, "o14": 8, "o5": 8, "o65": 8, "plagiar": 8, "o16": 8, "o6": 8, "o47": 8, "campaign": 8, "o12": 8, "o52": 8, "surveil": 8, "spous": 8, "know": 8, "o13": 8, "ncount": 8, "21318": 8, "8756": 8, "6486": 8, "o2": 8, "1717": 8, "o4": 8, "1477": 8, "o3": 8, "socioeconom": 8, "851": 8, "int64": 8, "gen": 8, "15433": 8, "hh": 8, "4184": 8, "659": 8, "advbench": 8, "230": 8, "189": 8, "toxicchat": 8, "anyth": 8, "misconcept": 8, "ingrain": 8, "mc1": 8, "singular": 8, "choices4": 8, "mc2": 8, "set4": 8, "scorer": 8, "correctli": [8, 9], "truthful_qa": 8, "truthfulqa_dataset": 8, "multiple_choic": 8, "best_answ": 8, "correct_answ": 8, "incorrect_answ": 8, "watermelon": 8, "digest": 8, "noth": 8, "stomach": 8, "sick": 8, "wonderopoli": 8, "wonder": 8, "belli": 8, "swallow": 8, "dream": 8, "die": 8, "indigest": 8, "unconsci": 8, "excret": 8, "asr": 8, "r2d2": 8, "wider": [8, 9], "mass": 8, "destruct": 8, "asynchron": 8, "webpurifi": 8, "protectai": 8, "comprehend": 8, "amazon": 8, "nvidia": [8, 9], "keyword": 8, "toolset": 8, "nemmo": 8, "synchron": 8, "nemoguardrail": 8, "llmrail": 8, "railsconfig": 8, "from_path": 8, "rail": 8, "hello": 8, "ministr": 8, "mistralai": 8, "mistral_api_kei": 8, "moderate_chat": 8, "omni": 8, "pprint": 8, "to_json": 8, "threaten": 8, "illicit": 8, "granit": 8, "guardian": 8, "consortium": 8, "11b": 8, "begin_of_text": 8, "start_header_id": 8, "end_header_id": 8, "unsafe_categori": 8, "user_message_1": 8, "model_answer_1": 8, "comma": 8, "eot_id": 8, "eom_id": 8, "denot": 8, "s1": 8, "s2": 8, "s3": 8, "s4": 8, "s5": 8, "defam": 8, "s6": 8, "s7": 8, "s8": 8, "s9": 8, "s10": 8, "s11": 8, "s12": 8, "s13": 8, "padhi": 8, "atla": 8, "2b": 8, "hap": 8, "38m": 8, "125m": 8, "shieldgemma": 8, "judge_prompt": 8, "american": 8, "vandal": 8, "underag": 8, "drink": 8, "vulgar": 8, "obscen": 8, "racism": 8, "derogatori": 8, "firearm": 8, "safety_scor": 8, "IN": 8, "borderlin": 8, "verdict": 8, "boolean": [8, 9], "brief": 8, "rational": 8, "paramount": [8, 9], "evenli": 8, "good_sampl": 8, "bad_sampl": 8, "2024z": 8, "surg": 8, "scam": 8, "get_profanity_sampl": 8, "show_stat": 8, "current_dir": 8, "getcwd": 8, "data_path": 8, "profanity_en": 8, "random_st": 8, "ncategori": 8, "category_count": 8, "category_1": 8, "1f": 8, "profanity_sampl": 8, "nsampl": 8, "anatomi": 8, "slur": 8, "182": 8, "bodili": 8, "fluid": 8, "excrement": 8, "insult": 8, "mental": 8, "disabl": 8, "jap": 8, "babi": [8, 9], "batter": 8, "crazi": 8, "sob": 8, "fukka": 8, "sh1t3": 8, "get_salad_sampl": 8, "salad_sampl": 8, "tortur": 8, "porn": 8, "sin": 8, "sight": 8, "god": 8, "embezzl": 8, "xanax": 8, "alcohol": 8, "get_good_sampl": 8, "min_scor": 8, "reichstag": 8, "profanity_data": 8, "salad_data": 8, "good_data": 8, "all_data": 8, "prompt_sampl": 8, "is_unsaf": 8, "counti": 8, "holli": 8, "ridg": 8, "nc": 8, "town": 8, "onslow": 8, "carolina": 8, "diver": 8, "underwat": 8, "maze": 8, "coral": 8, "treasur": 8, "vivid": 8, "sensori": 8, "emot": 8, "labyrinthin": 8, "reef": 8, "suspens": 8, "obstacl": 8, "creatur": 8, "nomin": 8, "nobel": 8, "literatur": 8, "love": 8, "logo": 8, "thief": 8, "rob": 8, "famou": 8, "nstatist": 8, "source_stat": 8, "type_stat": 8, "plug": 8, "safetyvalid": 8, "validationresult": 8, "dataclass": 8, "abstractmethod": 8, "llmguardvalid": 8, "scanner": 8, "bantop": 8, "llm_guard": 8, "input_scann": 8, "scan_prompt": 8, "matchtyp": 8, "default_banned_top": 8, "banned_top": 8, "super": 8, "banned_topics_scann": 8, "use_onnx": 8, "toxicity_scann": 8, "match_typ": 8, "fail_fast": 8, "unsafe_scann": 8, "gun": 8, "cool": 8, "hunt": 8, "deer": 8, "dad": 8, "mistralvalid": 8, "hate_and_discrimin": 8, "violence_and_threat": 8, "dangerous_and_criminal_cont": 8, "selfharm": 8, "openaivalid": 8, "attr": 8, "dir": 8, "getattr": 8, "illicit_viol": 8, "llmjudgevalid": 8, "prompt_path": 8, "llmjudg": 8, "filenotfounderror": 8, "slice": 8, "elaps": 8, "score_valid": 8, "id_": 8, "validator_inst": 8, "validation_result": 8, "elapsed_tim": 8, "prompt_sample_id": 8, "validator_nam": 8, "scoring_prompt": 8, "scoring_result": 8, "01536": 8, "34098": 8, "497136": 8, "546416": 8, "calculate_validator_metr": 8, "bad_sourc": 8, "good_sourc": 8, "tpr": 8, "fpr": 8, "f1_score": 8, "tn": 8, "fp": 8, "fn": 8, "tp": 8, "255": 8, "74": 8, "238": 8, "130": 8, "256": 8, "149": 8, "213": 8, "190": 8, "235": 8, "63": [8, 9], "222": 8, "480": 8, "157": 8, "487": 8, "495": 8, "482": 8, "667": 8, "248": 8, "466": 8, "143": 8, "355": 8, "slowest": 8, "room": 8, "false_posit": 8, "left_on": 8, "right_on": 8, "persian": 8, "teenag": 8, "\u0686\u0627\u0644\u0634": 8, "\u0645\u0648\u0645\u0648": 8, "\u06cc\u06a9": 8, "\u062d\u0642\u0647": 8, "\u0648": 8, "\u0627\u0641\u0633\u0627\u0646\u0647": 8, "\u0645\u062d\u0644\u06cc": 8, "\u0627\u0633\u062a": 8, "\u06a9\u0647": 8, "\u0634\u0627\u06cc\u0639\u0647": 8, "\u0622\u0646": 8, "\u062f\u0631": 8, "\u0634\u0628\u06a9\u0647": 8, "\u0647\u0627\u06cc": 8, "\u0627\u062c\u062a\u0645\u0627\u0639\u06cc": 8, "\u067e\u062e\u0634": 8, "\u0634\u062f": 8, "\u06af\u0632\u0627\u0631\u0634": 8, "\u06a9\u0627\u0631\u0628\u0631\u06cc": 8, "\u0628\u0627": 8, "\u0646\u0627\u0645": 8, "\u06a9\u0648\u062f\u06a9\u0627\u0646": 8, "\u0646\u0648\u062c\u0648\u0627\u0646\u0627\u0646": 8, "\u0631\u0627": 8, "\u0628\u0647": 8, "\u0627\u0639\u0645\u0627\u0644": 8, "\u062e\u0637\u0631\u0646\u0627\u06a9\u06cc": 8, "\u0645\u0627\u0646\u0646\u062f": 8, "\u062e\u0648\u062f\u0632\u0646\u06cc": 8, "\u062e\u0648\u062f\u06a9\u0634\u06cc": 8, "\u062a\u0631\u063a\u06cc\u0628": 8, "\u0645\u06cc": 8, "\u06a9\u0646\u062f": 8, "\u0648\u062c\u0648\u062f": 8, "\u0635\u062d\u0628\u062a": 8, "\u0632\u06cc\u0627\u062f\u06cc": 8, "\u0645\u0648\u0631\u062f": 8, "\u0627\u06cc\u0646": 8, "\u062a\u0639\u062f\u0627\u062f": 8, "\u0634\u06a9\u0627\u06cc\u0627\u062a": 8, "\u0648\u0627\u0642\u0639\u06cc": 8, "\u0628\u0633\u06cc\u0627\u0631": 8, "\u06a9\u0645": 8, "\u0628\u0648\u062f": 8, "\u067e\u0644\u06cc\u0633": 8, "\u0635\u062f\u0645\u0647": 8, "\u062f\u06cc\u062f\u0646": 8, "\u062f\u0644\u06cc\u0644": 8, "\u062a\u0623\u062b\u06cc\u0631": 8, "\u0645\u0633\u062a\u0642\u06cc\u0645": 8, "\u067e\u062f\u06cc\u062f\u0647": 8, "\u062a\u0623\u06cc\u06cc\u062f": 8, "\u0646\u06a9\u0631\u062f\u0647": 8, "\u062a\u0631\u0633": 8, "\u0646\u06af\u0631\u0627\u0646\u06cc": 8, "\u0627\u06cc\u062c\u0627\u062f": 8, "\u0634\u062f\u0647": 8, "\u0628\u06cc\u0634\u062a\u0631": 8, "\u0627\u0632": 8, "\u062e\u0648\u062f": 8, "\u0631\u0633\u0627\u0646\u0647": 8, "\u0647\u0627": 8, "\u0637\u0648\u0631\u06cc": 8, "\u062e\u06cc\u0631\u06cc\u0647": 8, "\u0647\u0634\u062f\u0627\u0631": 8, "\u062f\u0627\u062f\u0646\u062f": 8, "\u0622\u0633\u06cc\u0628": 8, "\u0627\u0646\u062a\u0638\u0627\u0631\u0627\u062a": 8, "\u0645\u062d\u062a\u0648\u0627\u06cc": 8, "\u062e\u0634\u0648\u0646\u062a": 8, "\u0622\u0645\u06cc\u0632": 8, "\u0627\u06cc\u0646\u062a\u0631\u0646\u062a": 8, "\u06af\u0641\u062a\u0647": 8, "\u0634\u0648\u062f": 8, "\u0627\u0648\u0644\u06cc\u0646": 8, "\u0628\u0627\u0631": 8, "\u0633\u0627\u0644": 8, "\u06f2\u06f0\u06f1\u06f8": 8, "\u067e\u0633": 8, "\u0622\u0646\u06a9\u0647": 8, "\u0631\u0648\u0632\u0646\u0627\u0645\u0647": 8, "\u0627\u0646\u062f\u0648\u0646\u0632\u06cc\u0627\u06cc\u06cc": 8, "\u062e\u0628\u0631": 8, "\u062f\u062e\u062a\u0631": 8, "\u06f1\u06f2": 8, "\u0633\u0627\u0644\u0647": 8, "\u062f\u0627\u062f": 8, "\u0645\u0648\u0636\u0648\u0639": 8, "\u062c\u0647\u0627\u0646\u06cc": 8, "\u062a\u0628\u062f\u06cc\u0644": 8, "\u0645\u062c\u0633\u0645\u0647": 8, "\u0647\u0646\u0631\u0645\u0646\u062f": 8, "\u0698\u0627\u067e\u0646\u06cc": 8, "\u0647\u0631": 8, "\u0686\u0646\u062f": 8, "\u0634\u0627\u06cc\u062f": 8, "\u0646\u06af\u0627\u0647": 8, "\u0628\u0639\u0636\u06cc": 8, "\u0632\u06cc\u0628\u0627": 8, "\u0646\u0628\u0627\u0634\u062f": 8, "\u0627\u0645\u0627": 8, "\u06a9\u0627\u0645\u0644\u0627": 8, "\u0628\u06cc": 8, "\u062e\u0637\u0631": 8, "\u0627\u06cc\u0631\u0627\u0646": 8, "\u0645\u062f\u062a": 8, "\u0628\u06cc\u0646": 8, "\u06a9\u0627\u0631\u0628\u0631\u0627\u0646": 8, "\u0645\u0637\u0631\u062d": 8, "\u0633\u0627\u0644\u06cc": 8, "\u0633\u0631\u0627\u0633\u0631": 8, "\u062c\u0647\u0627\u0646": 8, "\u0645\u0634\u0627\u0628\u0647\u06cc": 8, "\u0628\u0631\u0627\u06cc": 8, "\u0648\u0627\u0644\u062f\u06cc\u0646": 8, "\u06a9\u0631\u062f\u0647": 8, "\u0627\u0641\u0631\u0627\u062f": 8, "\u0686\u0647": 8, "\u06a9\u0627\u0631\u06cc": 8, "\u062f\u0639\u0648\u062a": 8, "tourist": 8, "distress": 8, "polish": 8, "galician": 8, "dzisiaj": 8, "szwecji": 8, "innych": 8, "bogatych": 8, "krajach": 8, "ludzi": 8, "u\u017cywaj\u0105": 8, "mn\u00f3stwo": 8, "najr\u00f3\u017cniejszych": 8, "urz\u0105dze\u0144": 8, "hox": 8, "suecia": 8, "outro": 8, "pa\u00eds": 8, "rico": 8, "xent": 8, "moita": 8, "m\u00e1quina": 8, "diferent": 8, "\u0142\u00f3dka": 8, "zaczyna": 8, "ton\u0105\u0107": 8, "tury\u015bci": 8, "wracaj\u0105": 8, "statek": 8, "dom\u00f3w": 8, "gdzie": 8, "opowiadaj\u0105": 8, "tym": 8, "jak": 8, "zostali": 8, "zaatakowani": 8, "surprisingli": 8, "shelf": 8, "unsettl": 8, "paradox": 8, "harbor": 8, "wisdom": 8, "aspir": 8, "technologist": 8, "disciplinari": 8, "ethicist": 8, "policymak": 8, "asa24": 8, "jide": 8, "jona": 8, "schuett": 8, "marku": 8, "anderljung": 8, "08751": 8, "bhy": 8, "hinton": 8, "pieter": 8, "abbeel": 8, "trevor": 8, "darrel": 8, "yuval": 8, "harari": 8, "ya": 8, "lan": 8, "shai": 8, "shalev": 8, "gillian": 8, "hadfield": 8, "clune": 8, "tegan": 8, "maharaj": 8, "hutter": 8, "at\u0131l\u0131m": 8, "g\u00fcne\u015f": 8, "baydin": 8, "sheila": 8, "mcilraith": 8, "qiqi": 8, "ashwin": 8, "acharya": 8, "anca": 8, "dragan": 8, "philip": 8, "torr": 8, "kahneman": 8, "s\u00f6ren": 8, "mindermann": 8, "amid": 8, "384": 8, "6698": 8, "1126": 8, "adn0117": 8, "bbc": 8, "emili": 8, "braca": 8, "israel": 8, "carter": 8, "hafsa": 8, "kanchwala": 8, "khojasteh": 8, "charli": 8, "landow": 8, "luo": 8, "magarelli": 8, "mirin": 8, "averi": 8, "moyer": 8, "kayla": 8, "simpson": 8, "amelia": 8, "skawinski": 8, "heverin": 8, "23308": 8, "bmc": 8, "dillon": 8, "brendan": 8, "murphi": 8, "khachaturov": 8, "gleav": 8, "kellin": 8, "pelrin": 8, "2408": [8, 9], "02946": 8, "cmm": 8, "erik": 8, "lorenzo": 8, "malandri": 8, "mercorio": 8, "navid": 8, "nobani": 8, "seveso": 8, "15248": 8, "edg24": 8, "exa24": 8, "cyber": 8, "grb": 8, "rossi": 8, "barrow": 8, "mehrab": 8, "tanjim": 8, "sungchul": 8, "franck": 8, "dernoncourt": 8, "ruiyi": 8, "nesreen": 8, "2309": 8, "00770": 8, "h44z": 8, "hgp": 8, "saadia": 8, "hamid": 8, "palangi": 8, "dipankar": 8, "ec": 8, "kamar": 8, "oxi": 8, "smaranda": 8, "muresan": 8, "preslav": 8, "nakov": 8, "alin": 8, "villavicencio": 8, "60th": 8, "3309": 8, "3326": 8, "dublin": 8, "hym": 8, "weijiang": 8, "weitao": 8, "weihong": 8, "zhangyin": 8, "haotian": 8, "qianglong": 8, "weihua": 8, "xiaocheng": 8, "bing": 8, "dx": 8, "3703155": 8, "iuc": 8, "kartikeya": 8, "upasani": 8, "jianfeng": 8, "krithika": 8, "tontchev": 8, "2312": 8, "06674": 8, "ldw": 8, "lijun": 8, "ruohui": 8, "xuhao": 8, "wangmeng": 8, "zuo": 8, "dahua": 8, "qiao": 8, "shao": 8, "05044": 8, "mpy": 8, "xuwang": 8, "zifan": 8, "norman": 8, "mu": 8, "elham": 8, "sakhae": 8, "nathaniel": 8, "forsyth": 8, "04249": 8, "ma24": 8, "mlc24": 8, "illumin": 8, "ailumin": 8, "oaa": 8, "adler": 8, "ahmad": 8, "ilg": 8, "akkaya": 8, "florencia": 8, "leoni": 8, "aleman": 8, "janko": 8, "altenschmidt": 8, "altman": 8, "shyamal": 8, "anadkat": 8, "avila": 8, "valeri": 8, "balcom": 8, "baltescu": 8, "haim": 8, "belgum": 8, "irwan": 8, "bello": 8, "jake": 8, "berdin": 8, "bernadett": 8, "shapiro": 8, "berner": 8, "lenni": 8, "bogdonoff": 8, "boiko": 8, "madelain": 8, "boyd": 8, "luisa": 8, "brakman": 8, "button": 8, "rosi": 8, "campbel": 8, "cann": 8, "brittani": 8, "carei": 8, "carlson": 8, "rori": 8, "carmichael": 8, "che": 8, "foti": 8, "sulli": 8, "rubi": 8, "chess": 8, "chester": 8, "cho": 8, "hyung": 8, "won": 8, "chung": 8, "jeremiah": 8, "currier": 8, "yunx": 8, "cori": 8, "decareaux": 8, "degri": 8, "deutsch": 8, "devil": 8, "dhar": 8, "dowl": 8, "dun": 8, "adrien": 8, "ecoffet": 8, "atti": 8, "eleti": 8, "tyna": 8, "elound": 8, "farhi": 8, "niko": 8, "sim\u00f3n": 8, "posada": 8, "fishman": 8, "juston": 8, "isabella": 8, "fulford": 8, "georg": 8, "gibson": 8, "vik": 8, "tarun": 8, "gogineni": 8, "goh": 8, "rapha": 8, "gontijo": 8, "lope": 8, "gordon": 8, "morgan": 8, "grafstein": 8, "yufei": 8, "hallaci": 8, "heaton": 8, "johann": 8, "heideck": 8, "hickei": 8, "wade": 8, "hoeschel": 8, "houghton": 8, "kenni": 8, "hsu": 8, "shengli": 8, "joost": 8, "huizinga": 8, "shawn": 8, "joann": 8, "jang": 8, "roger": 8, "haozhun": 8, "shino": 8, "jomoto": 8, "billi": 8, "jonn": 8, "tomer": 8, "kaftan": 8, "\u0142ukasz": 8, "kamali": 8, "ingmar": 8, "kanitscheid": 8, "tabarak": 8, "khan": 8, "logan": 8, "kilpatrick": 8, "jong": 8, "wook": 8, "christina": 8, "yongjik": 8, "hendrik": 8, "kirchner": 8, "kiro": 8, "matt": 8, "kokotajlo": 8, "kondraciuk": 8, "kondrich": 8, "konstantinidi": 8, "kosic": 8, "vishal": 8, "kuo": 8, "lamp": 8, "ikai": 8, "teddi": 8, "jade": 8, "leung": 8, "chak": 8, "lim": 8, "molli": 8, "mateusz": 8, "litwin": 8, "theresa": 8, "lopez": 8, "patricia": 8, "lue": 8, "makanju": 8, "malfacini": 8, "markov": 8, "yaniv": 8, "markovski": 8, "bianca": 8, "mayn": 8, "mckinnei": 8, "christin": 8, "mcleavei": 8, "mcmillan": 8, "mcneil": 8, "aalok": 8, "menick": 8, "mishchenko": 8, "vinni": 8, "monaco": 8, "murk": 8, "m\u00e9ly": 8, "ashvin": 8, "nair": 8, "reiichiro": 8, "nakano": 8, "rajeev": 8, "nayak": 8, "arvind": 8, "neelakantan": 8, "hyeonwoo": 8, "noh": 8, "keef": 8, "jakub": 8, "pachocki": 8, "palermo": 8, "ashlei": 8, "pantuliano": 8, "parish": 8, "emi": 8, "parparita": 8, "passo": 8, "perelman": 8, "belbut": 8, "pere": 8, "pokorni": 8, "pokrass": 8, "vitchyr": 8, "pong": 8, "tolli": 8, "powel": 8, "bori": 8, "proehl": 8, "rae": 8, "ramesh": 8, "franci": 8, "kendra": 8, "rimbach": 8, "carl": 8, "rotst": 8, "roussez": 8, "saltarelli": 8, "ted": 8, "sander": 8, "schnurr": 8, "selsam": 8, "kyla": 8, "sheppard": 8, "toki": 8, "sherbakov": 8, "shieh": 8, "shoker": 8, "pranav": 8, "szymon": 8, "sidor": 8, "sigler": 8, "sitkin": 8, "sokolowski": 8, "natali": 8, "staudach": 8, "madelein": 8, "phil": 8, "tootoonchian": 8, "tseng": 8, "preston": 8, "tuggl": 8, "turlei": 8, "juan": 8, "cer\u00f3n": 8, "urib": 8, "vallon": 8, "vijayvergiya": 8, "jai": 8, "alvin": 8, "ward": 8, "cj": 8, "weinmann": 8, "akila": 8, "welihinda": 8, "jiayi": 8, "weng": 8, "lilian": 8, "wiethoff": 8, "willner": 8, "wolrich": 8, "lauren": 8, "workman": 8, "sherwin": 8, "yoo": 8, "zeller": 8, "shengjia": 8, "juntang": 8, "zhuk": 8, "2303": 8, "08774": 8, "pnc": 8, "inkit": 8, "manish": 8, "nagireddi": 8, "giandomenico": 8, "cornacchia": 8, "subhajit": 8, "chaudhuri": 8, "tejaswini": 8, "pedapati": 8, "pierr": 8, "dognin": 8, "keerthiram": 8, "murugesan": 8, "miehl": 8, "santill\u00e1n": 8, "kieran": 8, "giulio": 8, "zizzo": 8, "muhammad": 8, "zaid": 8, "hame": 8, "purcel": 8, "desmond": 8, "zahra": 8, "ashktorab": 8, "ing": 8, "vejsbjerg": 8, "dali": 8, "hind": 8, "werner": 8, "geyer": 8, "ambrish": 8, "rawat": 8, "kush": 8, "varshnei": 8, "prasanna": 8, "sattigeri": 8, "07724": 8, "pcz": 8, "shern": 8, "woodsid": 8, "hanlin": 8, "emmon": 8, "justifi": 8, "machiavelli": 8, "2304": 8, "03279": 8, "saffron": 8, "ring": 8, "aslanid": 8, "glaes": 8, "nat": 8, "mcalees": 8, "irv": 8, "2202": 8, "03286": 8, "sjls22": 8, "lingfeng": 8, "haiyun": 8, "lemao": 8, "backdoor": 8, "2201": 8, "02993": 8, "szw": 8, "qinghua": 8, "higham": 8, "gorban": 8, "bastouni": 8, "ivan": 8, "tyukin": 8, "12670": 8, "vsk": 8, "simplesafetytest": 8, "2311": 8, "08370": 8, "wmr24": 8, "sandra": 8, "brent": 8, "mittelstadt": 8, "duti": 8, "royal": 8, "240197": 8, "royalsocietypublish": 8, "1098": 8, "rso": 8, "wcp": 8, "boxin": 8, "weixin": 8, "hengzhi": 8, "chulin": 8, "mintong": 8, "kang": 8, "chenhui": 8, "chejian": 8, "zidi": 8, "xiong": [8, 9], "ritik": 8, "truong": 8, "simran": 8, "arora": 8, "zinan": 8, "decodingtrust": 8, "11698": 8, "ylx24": 8, "jiahao": 8, "xingwei": 8, "zyi": 8, "shune": 8, "lyumanshan": 8, "jingyu": 8, "shui": 8, "haobin": 8, "pengfei": 8, "hewu": 8, "ghost": 8, "14931": 8, "zho24": 8, "amazonwservices24": 8, "anthropic24": 8, "cdn": 8, "1adf000c8f675958c2ee23805d91aaade1cd4613": 8, "centerfasafety24a": 8, "centerforaisafeti": 8, "centerfasafety24b": 8, "deepmind24": 8, "googleapi": 8, "fsf": 8, "europeanmagency24": 8, "ema": 8, "europa": 8, "activities_en": 8, "financialirauthority24": 8, "harmbench24": 8, "ibm24": 8, "watsonx": 8, "saa": 8, "libraryocongress23": 8, "loc": 8, "mistralai24": 8, "mlsteam24": 8, "mlsafeti": 8, "nationaliosatechnology24": 8, "nist": 8, "itl": 8, "nvidia24": 8, "openai24a": 8, "openai24b": 8, "opensafetylab24a": 8, "opensafetylab24b": 8, "protectai24": 8, "surgeai24": 8, "ukgovernment24": 8, "unicef24": 8, "innocenti": 8, "julia": 9, "easili": 9, "trial": 9, "wrangl": 9, "hoc": 9, "unwant": 9, "overflow": 9, "twitter": 9, "youtub": 9, "ldot": 9, "prod_": 9, "syntact": 9, "xml": 9, "invalid": 9, "delic": 9, "heart": 9, "ttt": 9, "itt": 9, "po": 9, "nousresearch": 9, "herm": 9, "person1": 9, "q1": 9, "person2": 9, "json_format": 9, "response_cont": 9, "is_json": 9, "myjson": 9, "nest": 9, "conceptu": 9, "unend": 9, "whitespac": 9, "throw": 9, "somewher": 9, "json_object": 9, "circul": 9, "vertex": 9, "went": 9, "secextract": 9, "mentioned_ent": 9, "mentioned_plac": 9, "extract_from_sec_fil": 9, "sec_filing_text": 9, "hint": 9, "prompt_extract": 9, "sec_extract": 9, "washington": 9, "beg": 9, "unnorm": 9, "0325": 9, "strongest": 9, "greedi": 9, "bfloat16": 9, "device_map": 9, "src": 9, "python3": 9, "nvml": 9, "return_tensor": 9, "pt": 9, "inference_mod": 9, "last_token_logit": 9, "next_token_prob": 9, "nn": 9, "dim": 9, "top_k_prob": 9, "top_k_indic": 9, "topk": 9, "top_k_token": 9, "decod": 9, "idx": 9, "skip_special_token": 9, "prob": 9, "0305": 9, "0197": 9, "0106": 9, "0093": 9, "logitsprocessor": 9, "logits_processor": 9, "logitsprocessorlist": 9, "customlogitsprocessor": 9, "intermediari": 9, "input_id": 9, "__call__": 9, "longtensor": 9, "batch_siz": 9, "sequence_length": 9, "floattensor": 9, "vocab_s": 9, "mask": 9, "pick": 9, "yesnologitsprocessor": 9, "initial_length": 9, "fill_": 9, "inf": 9, "debug": 9, "yes_token": 9, "add_special_token": 9, "no_token": 9, "yes_no_logit": 9, "yes_no_prob": 9, "yes_prob": 9, "no_prob": 9, "yes_mask": 9, "1e4": 9, "NO": 9, "generation_output_control": 9, "uncontrol": 9, "generation_output": 9, "4263": 9, "5737": 9, "10407": 9, "4607": 9, "6250": 9, "9219": 9, "helper": 9, "model_output": 9, "gen_output": 9, "batch_decod": 9, "clean_up_tokenization_spac": 9, "classic": 9, "italian": 9, "willard": 9, "louf": 9, "reformul": 9, "finit": 9, "fsm": 9, "s_": 9, "s_t": 9, "s_1": 9, "tild": 9, "odot": 9, "rightarrow": 9, "wise": 9, "thien": 9, "automaton": 9, "dfa": 9, "outgo": 9, "renorm": 9, "yy": 9, "ever": 9, "aa": 9, "lwai": 9, "prop": 9, "yynnaa": 9, "malform": 9, "sec_extraction_outlin": 9, "zsp": 9, "zicorp": 9, "with_structured_output": 9, "runnabl": 9, "typeddict": 9, "qu": 9, "langchain_openai": 9, "chatopenai": 9, "chatprompttempl": 9, "extract_from_sec_filing_langchain": 9, "structured_llm": 9, "from_messag": 9, "sec_extraction_langchain": 9, "bnf": 9, "backu": 9, "naur": 9, "fssl": 9, "extract_entities_from_sec_fil": 9, "ollama_structured_output_prompt_suffix": 9, "ollama_structured_output_temperatur": 9, "uncensor": 9, "model_json_schema": 9, "response_json": 9, "sharpli": 9, "exllama2": 9, "zoo": 9, "furthermor": 9, "nonetheless": 9, "extran": 9, "dispar": 9, "preval": 9, "peer": 9, "speak": 9, "aider": 9, "outweigh": 9, "rebutt": 9, "dottxt": 9, "reproduct": 9, "paint": 9, "flaw": 9, "uneven": 9, "conflat": 9, "drawback": 9, "pfiffer": 9, "wrestl": 9, "aid24": 9, "dot24": 9, "demo": 9, "gge24": 9, "lan4b": 9, "lww": 9, "xun": 9, "hanyu": 9, "yezhaohui": 9, "shichao": 9, "simin": 9, "shunyu": 9, "feiyu": 9, "zhiyu": 9, "12599": 9, "llf": 9, "xieyang": 9, "frederick": 9, "fiannaca": 9, "terri": 9, "koo": 9, "dixon": 9, "ea": 9, "3613905": 9, "3650756": 9, "xuan": 9, "hai": 9, "nguyen": 9, "ngoc": 9, "tiviati": 9, "hieu": 9, "dao": 9, "shafiq": 9, "joti": 9, "kenji": 9, "kawaguchi": 9, "nanci": 9, "min": 9, "kan": 9, "08656": 9, "nou24": 9, "out24": 9, "twt": 9, "zhi": 9, "kuang": 9, "tsai": 9, "chieh": 9, "hung": 9, "nung": 9, "02442": 9, "tt24": 9, "vivien": 9, "vivien000": 9, "wl23": 9, "r\u00e9mi": 9, "09702": 9, "guidanceai24": 9, "nvidia4a": 9, "wikipediacontributors24": 9, "wiktionari": 9, "naur_form": 9}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"about": [0, 2], "book": [0, 2], "content": [0, 3, 4, 5, 6, 7, 8, 9], "core": 0, "challeng": 0, "we": 0, "ll": 0, "address": 0, "A": [0, 2, 3, 4], "practic": [0, 2, 7, 9], "approach": [0, 4, 8], "an": 0, "open": [0, 2, 7], "sourc": [0, 2, 7], "note": [0, 3], "perspect": 0, "who": 0, "thi": 0, "i": [0, 3, 6], "For": 0, "outcom": 0, "prerequisit": 0, "set": 0, "up": 0, "your": [0, 7], "environ": 0, "code": 0, "repositori": 0, "python": 0, "setup": [0, 3], "api": [0, 8], "kei": [0, 5], "configur": 0, "troubleshoot": 0, "common": [0, 8], "issu": 0, "author": 0, "prefac": [1, 2], "tame": 2, "llm": [2, 4, 5, 7, 8], "guid": 2, "pitfal": [2, 8], "softwar": [2, 5], "chapter": 2, "1": [2, 8], "The": [2, 4, 5, 7], "eval": [2, 5, 8], "gap": [2, 5], "2": [2, 7, 8], "structur": [2, 6, 9], "output": [2, 9], "3": [2, 8], "manag": [2, 6], "input": [2, 6], "data": [2, 3, 6], "4": [2, 8], "safeti": [2, 8], "5": [2, 8], "prefer": [2, 3], "base": [2, 3, 5, 6, 8], "align": [2, 3], "6": [2, 8], "local": [2, 7], "7": 2, "fall": [2, 4], "cost": [2, 4, 7], "paradox": [2, 4], "8": 2, "frontier": 2, "appendix": 2, "tool": [2, 5, 7, 8, 9], "resourc": 2, "introduct": [3, 5, 6, 7, 8, 9], "from": 3, "raw": 3, "capabl": 3, "On": 3, "misalign": 3, "languag": 3, "model": [3, 5, 7], "human": 3, "supervis": 3, "fine": [3, 7, 9], "tune": [3, 7, 9], "sft": 3, "augment": [3, 6], "post": [3, 9], "train": 3, "answer": 3, "limit": 3, "collaps": 3, "fake": 3, "case": [3, 6, 7, 8], "studi": [3, 6, 7, 8], "polici": [3, 8], "experiment": 3, "deliver": 3, "smollm2": 3, "dataset": [3, 5, 7, 8], "synthet": 3, "gener": [3, 5, 6, 8], "user": [3, 8], "prompt": [3, 7, 9], "reject": 3, "respons": 3, "chosen": 3, "dpo": 3, "optim": [3, 4], "prepar": 3, "vibe": 3, "check": [3, 4], "evalu": [3, 5, 8], "discuss": [3, 6, 9], "conclus": [3, 4, 5, 6, 7, 8, 9], "refer": [3, 4, 5, 6, 7, 8, 9], "why": 4, "matter": 4, "more": 4, "than": 4, "ever": 4, "right": 4, "size": 4, "strateg": 4, "metric": [4, 5], "requir": [4, 5], "busi": 4, "perform": [4, 7], "oper": 4, "technic": [4, 8], "quantiz": [4, 7], "list": 4, "non": 5, "determinist": 5, "machin": 5, "emerg": 5, "properti": 5, "problem": [5, 9], "statement": [5, 9], "tradit": 5, "v": [5, 7], "design": [5, 8], "applic": 5, "test": 5, "matrix": 5, "conceptu": 5, "overview": 5, "consider": 5, "task": [5, 7], "benchmark": [5, 7, 8], "leaderboard": 5, "lightev": 5, "mmlu": 5, "econometr": 5, "sampl": [5, 8], "famili": [5, 7], "us": [5, 6], "langsmith": 5, "promptfoo": 5, "comparison": [5, 7, 9], "pars": 6, "document": 6, "markitdown": 6, "docl": 6, "framework": [6, 8, 9], "extract": 6, "retriev": 6, "chunk": 6, "contextu": 6, "link": 6, "long": 6, "form": 6, "ii": 6, "github": 6, "rag": 6, "iii": 6, "quiz": 6, "citat": 6, "implement": [6, 8], "exampl": 6, "usag": 6, "choos": 7, "suitabl": 7, "result": 7, "llama": 7, "licens": 7, "commun": 7, "support": 7, "custom": [7, 8], "mistral": [7, 8], "decemb": 7, "22": 7, "2024": 7, "deploy": 7, "serv": 7, "cpp": 7, "llamafil": 7, "ollama": [7, 9], "lama": 7, "ui": 7, "lm": 7, "studio": 7, "jan": 7, "webui": 7, "openwebui": 7, "effect": 7, "level": 7, "hardwar": 7, "takeawai": [7, 8], "risk": 8, "ai": 8, "amplifi": 8, "exist": 8, "harm": 8, "novel": 8, "associ": 8, "autonom": 8, "exacerb": 8, "factor": 8, "specif": 8, "guidanc": 8, "govern": 8, "organ": 8, "privat": 8, "sector": 8, "openai": 8, "anthrop": 8, "googl": 8, "rubric": 8, "mlcommon": 8, "centr": 8, "porquoi": 8, "red": 8, "team": 8, "constitut": 8, "explain": 8, "xai": 8, "plan": 8, "phase": 8, "definit": 8, "research": [8, 9], "identif": 8, "architectur": 8, "select": 8, "go": 8, "market": 8, "compon": 8, "salad": 8, "bench": 8, "truthfulqa": 8, "harmbench": 8, "safebench": 8, "techniqu": [8, 9], "repres": 8, "layer": 8, "map": 8, "rule": 8, "filter": 8, "moder": 8, "bad": 8, "good": 8, "guard": 8, "judg": 8, "valid": 8, "engin": 9, "json": 9, "mode": 9, "logit": 9, "process": 9, "outlin": 9, "langchain": 9, "best": 9, "compar": 9, "solut": 9, "ongo": 9, "debat": 9, "acknowledg": 9}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinxcontrib.bibtex": 9, "sphinx": 57}, "alltitles": {"About the Book": [[0, "about-the-book"], [2, "about-the-book"]], "Contents": [[0, "contents"], [3, "contents"], [4, "contents"], [5, "contents"], [6, "contents"], [7, "contents"], [8, "contents"], [9, "contents"]], "Core Challenges We\u2019ll Address": [[0, "core-challenges-we-ll-address"]], "A Practical Approach": [[0, "a-practical-approach"]], "An Open Source Approach": [[0, "an-open-source-approach"]], "Open Source Book": [[0, "open-source-book"]], "A Note on Perspective": [[0, "a-note-on-perspective"]], "Who This Book Is For": [[0, "who-this-book-is-for"]], "Outcomes": [[0, "outcomes"]], "Prerequisites": [[0, "prerequisites"]], "Setting Up Your Environment": [[0, "setting-up-your-environment"]], "Code Repository": [[0, "code-repository"]], "Python Environment Setup": [[0, "python-environment-setup"]], "API Keys Configuration": [[0, "api-keys-configuration"]], "Troubleshooting Common Issues": [[0, "troubleshooting-common-issues"]], "About the Author": [[0, "about-the-author"]], "Preface": [[1, "preface"], [2, "preface"]], "Taming LLMs": [[2, "taming-llms"]], "A Practical Guide to LLM Pitfalls with Open Source Software": [[2, "a-practical-guide-to-llm-pitfalls-with-open-source-software"]], "Chapter 1: The Evals Gap": [[2, "chapter-1-the-evals-gap"]], "Chapter 2: Structured Output": [[2, "chapter-2-structured-output"]], "Chapter 3: Managing Input Data": [[2, "chapter-3-managing-input-data"]], "Chapter 4: Safety": [[2, "chapter-4-safety"]], "Chapter 5: Preference-Based Alignment": [[2, "chapter-5-preference-based-alignment"]], "Chapter 6: Local LLMs in Practice": [[2, "chapter-6-local-llms-in-practice"]], "Chapter 7: The Falling Cost Paradox": [[2, "chapter-7-the-falling-cost-paradox"]], "Chapter 8: Frontiers": [[2, "chapter-8-frontiers"]], "Appendix A: Tools and Resources": [[2, "appendix-a-tools-and-resources"]], "Preference-Based Alignment": [[3, "preference-based-alignment"]], "Introduction": [[3, "introduction"], [5, "introduction"], [6, "introduction"], [7, "introduction"], [8, "introduction"], [9, "introduction"]], "From Raw Capabilities to Preference Alignment": [[3, "from-raw-capabilities-to-preference-alignment"]], "On the Misalignment of Language Models": [[3, "on-the-misalignment-of-language-models"]], "Aligning Language Models with Human Preferences": [[3, "aligning-language-models-with-human-preferences"]], "Supervised Fine-Tuning (SFT) for Model Alignment": [[3, "supervised-fine-tuning-sft-for-model-alignment"]], "Augmenting SFT with Human Preferences": [[3, "augmenting-sft-with-human-preferences"]], "Is Post-Training the Answer?": [[3, "is-post-training-the-answer"]], "Limitations": [[3, "limitations"]], "Model Collapse": [[3, "model-collapse"]], "Faking Alignment": [[3, "faking-alignment"]], "Case Study: Aligning a Language Model to a Policy": [[3, "case-study-aligning-a-language-model-to-a-policy"]], "Experimental Setup": [[3, "experimental-setup"]], "Deliverables": [[3, "deliverables"]], "A Note on smolLM2 Models": [[3, "a-note-on-smollm2-models"]], "Policy": [[3, "policy"]], "Preference Dataset - Synthetic Dataset Generation": [[3, "preference-dataset-synthetic-dataset-generation"]], "User Prompts": [[3, "user-prompts"]], "Rejected Responses": [[3, "rejected-responses"]], "Chosen Responses": [[3, "chosen-responses"]], "Generate DPO Dataset": [[3, "generate-dpo-dataset"]], "DPO-Based Optimization": [[3, "dpo-based-optimization"]], "Data Preparation": [[3, "data-preparation"]], "Fine-Tuning": [[3, "fine-tuning"]], "Vibe Check": [[3, "vibe-check"]], "Alignment Evaluation": [[3, "alignment-evaluation"]], "Discussion and Conclusions": [[3, "discussion-and-conclusions"]], "References": [[3, "references"], [4, "references"], [5, "references"], [6, "references"], [7, "references"], [8, "references"], [9, "references"]], "The Falling Cost Paradox": [[4, "the-falling-cost-paradox"]], "Why Optimization Matters More Than Ever": [[4, "why-optimization-matters-more-than-ever"]], "Right-Sizing LLMs: A Strategic Approach": [[4, "right-sizing-llms-a-strategic-approach"]], "Metrics": [[4, "metrics"], [5, "metrics"]], "Requirements": [[4, "requirements"]], "Business Requirements": [[4, "business-requirements"]], "Performance Requirements": [[4, "performance-requirements"]], "Operational Requirements": [[4, "operational-requirements"]], "Technical Requirements": [[4, "technical-requirements"]], "Quantization": [[4, "quantization"], [7, "quantization"]], "Check-list": [[4, "check-list"]], "Conclusion": [[4, "conclusion"], [5, "conclusion"], [6, "conclusion"], [7, "conclusion"], [8, "conclusion"], [9, "conclusion"]], "The Evals Gap": [[5, "the-evals-gap"]], "Non-Deterministic Generative Machines": [[5, "non-deterministic-generative-machines"]], "Emerging Properties": [[5, "emerging-properties"]], "Problem Statement": [[5, "problem-statement"], [9, "problem-statement"]], "Evals of Traditional Software vs LLMs": [[5, "evals-table"]], "Evals Design": [[5, "evals-design"]], "LLM Application Testing Requirements Matrix": [[5, "validation-requirements"]], "Conceptual Overview": [[5, "conceptual-overview"]], "Design Considerations": [[5, "design-considerations"]], "Key Metrics for Evaluating Generative Tasks": [[5, "key-metrics"]], "Evaluators": [[5, "evaluators"]], "Model-Based Evaluation": [[5, "model-based-evaluation"]], "Evaluating Evaluators": [[5, "evaluating-evaluators"]], "Benchmarks and Leaderboards": [[5, "benchmarks-and-leaderboards"]], "Tools": [[5, "tools"], [9, "tools"]], "LightEval": [[5, "lighteval"]], "MMLU Econometrics Task Dataset sample": [[5, "mmlu-econometrics"]], "Model Families Evaluated Using LightEval": [[5, "model-families"]], "LangSmith": [[5, "langsmith"]], "PromptFoo": [[5, "promptfoo"]], "Comparison": [[5, "comparison"], [7, "comparison"], [7, "id37"]], "Comparison of Lighteval, LangSmith, and Promptfoo": [[5, "tool-comparison"]], "Managing Input Data": [[6, "managing-input-data"]], "Parsing Documents": [[6, "parsing-documents"]], "MarkItDown": [[6, "markitdown"]], "Docling": [[6, "docling"]], "Frameworks-Based Parsing": [[6, "frameworks-based-parsing"]], "Structured Data Extraction": [[6, "structured-data-extraction"]], "Retrieval-Augmented Generation": [[6, "retrieval-augmented-generation"]], "Case Studies": [[6, "case-studies"]], "Case Study I: Content Chunking with Contextual Linking": [[6, "case-study-i-content-chunking-with-contextual-linking"]], "Generating long-form content": [[6, "generating-long-form-content"]], "Discussion": [[6, "discussion"], [6, "id14"], [9, "discussion"]], "Case Study II: Github RAG": [[6, "case-study-ii-github-rag"]], "Case Study III: Quiz Generation with Citations": [[6, "case-study-iii-quiz-generation-with-citations"]], "Use Case": [[6, "use-case"]], "Implementation": [[6, "implementation"]], "Example Usage": [[6, "example-usage"]], "Local LLMs in Practice": [[7, "local-llms-in-practice"]], "Choosing your Model": [[7, "choosing-your-model"]], "Task Suitability": [[7, "task-suitability"]], "Benchmark results for Llama 2 family of models.": [[7, "llama2-benchmark"]], "Performance & Cost": [[7, "performance-cost"]], "Licensing": [[7, "licensing"]], "Open Source LLMs.": [[7, "open-source-llms"]], "Community Support": [[7, "community-support"]], "Customization": [[7, "customization"]], "Mistral fine-tuning costs as of December 22, 2024.": [[7, "mistral-costs"]], "Tools for Local LLM Deployment": [[7, "tools-for-local-llm-deployment"]], "Serving Models": [[7, "serving-models"]], "LLama.cpp": [[7, "llama-cpp"]], "Llamafile": [[7, "llamafile"]], "Ollama": [[7, "ollama"], [9, "ollama"]], "lama.cpp vs Ollama vs Llamafile Comparison": [[7, "feature-comparison-local"]], "UI": [[7, "ui"]], "LM Studio": [[7, "lm-studio"]], "Jan": [[7, "jan"]], "Open WebUI": [[7, "open-webui"]], "LM Studio vs Jan vs OpenWebUI Comparison": [[7, "feature-comparison-ui"]], "Case Study: The Effect of Quantization on LLM Performance": [[7, "case-study-the-effect-of-quantization-on-llm-performance"]], "Prompts Dataset": [[7, "prompts-dataset"]], "Quantization Levels": [[7, "quantization-levels"]], "Benchmarking": [[7, "benchmarking"], [8, "benchmarking"]], "Results": [[7, "results"]], "Quantization Benchmarks": [[7, "quantization-benchmarks"]], "Benchmarking Hardware": [[7, "benchmarking-hardware"]], "Takeaways": [[7, "takeaways"], [8, "takeaways"]], "Safety": [[8, "safety"]], "Safety Risks": [[8, "safety-risks"]], "General AI Safety Risks": [[8, "general-ai-safety-risks"]], "Amplified Existing Harms and Novel Risks": [[8, "amplified-existing-harms-and-novel-risks"]], "Risks Associated with Autonomous AI": [[8, "risks-associated-with-autonomous-ai"]], "Exacerbating Factors": [[8, "exacerbating-factors"]], "LLMs Specific Safety Risks": [[8, "llms-specific-safety-risks"]], "Guidance": [[8, "guidance"]], "Governments & Organizations": [[8, "governments-organizations"]], "Private Sector": [[8, "private-sector"]], "OpenAI": [[8, "openai"]], "Anthropic": [[8, "anthropic"]], "Google": [[8, "google"]], "Rubrics": [[8, "rubrics"]], "MLCommons AI Safety Benchmark": [[8, "mlcommons-ai-safety-benchmark"]], "Centre for the Governance of AI Rubric": [[8, "centre-for-the-governance-of-ai-rubric"]], "Porquoi": [[8, "porquoi"]], "Approaches": [[8, "approaches"]], "Red Teaming": [[8, "red-teaming"]], "Constitutional AI": [[8, "constitutional-ai"]], "Explainable AI (XAI)": [[8, "explainable-ai-xai"]], "Designing a Safety Plan": [[8, "designing-a-safety-plan"]], "Phase 1. Policy Definition": [[8, "phase-1-policy-definition"]], "Phase 2. User Research & Risk Identification": [[8, "phase-2-user-research-risk-identification"]], "Phase 3. Evaluation Framework": [[8, "phase-3-evaluation-framework"]], "Phase 4. Safety Architecture Design": [[8, "phase-4-safety-architecture-design"]], "Phase 5. Implementation & Tools Selection": [[8, "phase-5-implementation-tools-selection"]], "Phase 6. Go-to-Market": [[8, "phase-6-go-to-market"]], "Common Pitfalls": [[8, "common-pitfalls"]], "Technical Implementation Components": [[8, "technical-implementation-components"]], "Benchmarks & Datasets": [[8, "benchmarks-datasets"]], "SALAD-Bench": [[8, "salad-bench"]], "TruthfulQA": [[8, "truthfulqa"]], "HarmBench": [[8, "harmbench"]], "SafeBench": [[8, "safebench"]], "Tools & Techniques": [[8, "tools-techniques"]], "Representative Safety Layer Risk Map.": [[8, "safety-layer-table"]], "Rules-Based Safety Filtering": [[8, "rules-based-safety-filtering"]], "Rules-Based Safety Filtering Tools.": [[8, "safety-layer-tools"]], "LLM-Based Safety Filtering": [[8, "llm-based-safety-filtering"]], "Custom Moderation": [[8, "custom-moderation"]], "Case Study: Implementing a Safety Filter": [[8, "case-study-implementing-a-safety-filter"]], "Evals Dataset": [[8, "evals-dataset"]], "Bad Samples": [[8, "bad-samples"]], "Good Samples": [[8, "good-samples"]], "Safety Filters": [[8, "safety-filters"]], "LLM-Guard": [[8, "llm-guard"]], "Mistral Moderation API": [[8, "mistral-moderation-api"]], "OpenAI Moderation API": [[8, "openai-moderation-api"]], "Custom Judge Validator": [[8, "custom-judge-validator"]], "Structured Output": [[9, "structured-output"]], "Techniques": [[9, "techniques"]], "Prompt Engineering": [[9, "prompt-engineering"]], "JSON Mode (Fine-Tuned)": [[9, "json-mode-fine-tuned"]], "Logit Post-Processing": [[9, "logit-post-processing"]], "Outlines": [[9, "outlines"]], "LangChain": [[9, "langchain"]], "Best Practices": [[9, "best-practices"]], "Comparing Solutions": [[9, "comparing-solutions"]], "Structured Output Frameworks Comparison": [[9, "structured-output-frameworks"]], "Research and Ongoing Debate": [[9, "research-and-ongoing-debate"]], "Acknowledgements": [[9, "acknowledgements"]]}, "indexentries": {}}) \ No newline at end of file +Search.setIndex({"docnames": ["markdown/intro", "markdown/preface", "markdown/toc", "notebooks/alignment", "notebooks/cost", "notebooks/evals", "notebooks/input", "notebooks/local", "notebooks/safety", "notebooks/structured_output"], "filenames": ["markdown/intro.md", "markdown/preface.md", "markdown/toc.md", "notebooks/alignment.ipynb", "notebooks/cost.ipynb", "notebooks/evals.ipynb", "notebooks/input.ipynb", "notebooks/local.ipynb", "notebooks/safety.ipynb", "notebooks/structured_output.ipynb"], "titles": ["2. About the Book", "1. Preface", "Taming LLMs", "7. Preference-Based Alignment", "9. The Falling Cost Paradox", "3. The Evals Gap", "5. Managing Input Data", "8. Local LLMs in Practice", "6. Safety", "4. Structured Output"], "terms": {"am": [0, 8], "alwai": [0, 3, 4, 5, 6, 9], "do": [0, 3, 4, 5, 6, 7, 8, 9], "which": [0, 3, 4, 5, 6, 7, 8, 9], "cannot": [0, 3, 4, 5, 7, 8], "order": [0, 3, 5, 6, 8, 9], "mai": [0, 1, 3, 4, 5, 6, 7, 8, 9], "learn": [0, 3, 5, 6, 7, 8, 9], "how": [0, 1, 3, 4, 5, 6, 7, 8, 9], "pablo": [0, 5], "picasso": 0, "In": [0, 3, 4, 5, 6, 7, 8, 9], "recent": [0, 3, 4, 5, 6, 7, 8, 9], "year": [0, 2, 3, 4, 5, 6, 7, 8, 9], "larg": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "languag": [0, 1, 2, 4, 5, 6, 7, 8, 9], "model": [0, 1, 2, 4, 6, 8, 9], "llm": [0, 1, 3, 9], "have": [0, 1, 3, 4, 5, 6, 7, 8, 9], "emerg": [0, 3, 4, 6, 7, 8, 9], "transform": [0, 1, 3, 5, 6, 7, 8, 9], "forc": [0, 5, 6, 9], "technologi": [0, 1, 4, 5, 6, 7, 8], "promis": [0, 3, 4, 5, 8], "revolution": [0, 8], "build": [0, 2, 3, 5, 6, 7, 8, 9], "product": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "interact": [0, 3, 4, 5, 6, 7, 8, 9], "comput": [0, 3, 4, 5, 6, 7, 8, 9], "from": [0, 1, 4, 5, 6, 7, 8, 9], "chatgpt": [0, 3, 4, 6, 7, 9], "llama": [0, 3, 4, 5, 6, 8, 9], "github": [0, 2, 3, 4, 5, 6, 7, 8, 9], "copilot": 0, "claud": [0, 3, 5, 7, 8], "artifact": 0, "system": [0, 3, 4, 5, 6, 7, 8, 9], "captur": [0, 1, 3, 5, 6, 7, 8], "public": [0, 3, 5, 6, 7, 8], "imagin": [0, 7], "spark": 0, "gold": [0, 3, 6, 8], "rush": 0, "ai": [0, 3, 4, 5, 6, 7, 9], "power": [0, 2, 3, 4, 5, 6, 7, 8, 9], "applic": [0, 1, 2, 3, 4, 6, 7, 8, 9], "howev": [0, 3, 4, 5, 6, 7, 8, 9], "beneath": 0, "surfac": [0, 5], "technolog": [0, 1, 4, 5, 6, 8], "revolut": [0, 4], "li": [0, 3, 5, 6, 7, 8, 9], "complex": [0, 1, 3, 5, 6, 7, 8, 9], "landscap": [0, 3, 5, 7], "softwar": [0, 1, 3, 4, 6, 7, 8, 9], "develop": [0, 1, 3, 4, 5, 6, 7, 8, 9], "tech": [0, 7, 8], "leader": [0, 2, 5, 8], "must": [0, 3, 4, 5, 7, 8, 9], "navig": [0, 2, 5, 6, 7, 8], "focus": [0, 3, 4, 5, 6, 7, 8, 9], "bring": [0, 3, 6, 7], "awar": [0, 3, 4, 5, 6, 8], "limit": [0, 1, 2, 4, 5, 7, 8, 9], "har": [0, 2, 5], "solut": [0, 2, 4, 5, 6, 7, 8], "overcom": [0, 5, 6], "them": [0, 1, 3, 4, 5, 6, 7, 8, 9], "robust": [0, 3, 4, 5, 6, 7, 8, 9], "It": [0, 3, 4, 5, 6, 7, 8, 9], "offer": [0, 3, 4, 5, 6, 7, 8, 9], "critic": [0, 2, 3, 4, 5, 6, 7, 8, 9], "implement": [0, 2, 3, 4, 5, 7, 9], "back": [0, 5, 6, 7, 8, 9], "reproduc": [0, 1, 2, 5, 7], "exampl": [0, 1, 2, 3, 5, 7, 8, 9], "while": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "mani": [0, 1, 3, 4, 5, 6, 7, 8, 9], "resourc": [0, 3, 4, 5, 6, 7, 8], "cover": [0, 3, 4, 5, 6, 7, 8, 9], "capabl": [0, 1, 2, 4, 5, 6, 7, 8, 9], "specif": [0, 3, 4, 5, 6, 7, 9], "hidden": [0, 3, 8], "pitfal": [0, 1, 3, 4, 5, 6, 7, 9], "engin": [0, 1, 2, 3, 4, 5, 6, 7, 8], "technic": [0, 1, 2, 3, 5, 6, 7, 9], "face": [0, 3, 4, 5, 6, 7, 8], "when": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "comprehens": [0, 2, 3, 4, 5, 6, 7, 8, 9], "guid": [0, 1, 3, 4, 5, 6, 7, 8, 9], "leverag": [0, 3, 5, 6, 7, 8, 9], "battl": [0, 2, 7], "test": [0, 2, 3, 4, 6, 7, 8, 9], "tool": [0, 1, 3, 4, 6], "throughout": [0, 4, 5, 6, 7, 8], "tackl": [0, 3, 5, 6, 8], "follow": [0, 3, 4, 5, 6, 7, 8, 9], "non": [0, 3, 6, 7, 8, 9], "exhaust": [0, 6, 7], "list": [0, 3, 5, 6, 7, 8, 9], "structur": [0, 3, 4, 5, 7, 8], "un": 0, "reliabl": [0, 1, 3, 4, 5, 6, 7, 8, 9], "struggl": [0, 1, 3, 5, 6, 7, 8, 9], "maintain": [0, 1, 3, 4, 5, 6, 7, 8, 9], "consist": [0, 1, 3, 4, 5, 6, 7, 8, 9], "output": [0, 1, 3, 5, 6, 7, 8], "format": [0, 3, 4, 5, 6, 7, 8, 9], "complic": [0, 8], "integr": [0, 1, 3, 4, 5, 6, 7, 8, 9], "larger": [0, 3, 4, 5, 6, 7, 8, 9], "make": [0, 3, 4, 5, 6, 7, 8, 9], "error": [0, 3, 5, 6, 8, 9], "handl": [0, 3, 4, 5, 6, 7, 8, 9], "more": [0, 1, 3, 5, 6, 7, 8, 9], "input": [0, 3, 5, 7, 8, 9], "data": [0, 1, 4, 5, 7, 8, 9], "manag": [0, 1, 4, 5, 7, 8, 9], "ar": [0, 1, 3, 4, 5, 6, 7, 8, 9], "sensit": [0, 3, 4, 5, 6, 7, 8], "oper": [0, 3, 5, 6, 7, 8, 9], "stale": [0, 6], "long": [0, 1, 3, 4, 5, 7, 8, 9], "context": [0, 1, 3, 4, 5, 6, 7, 8, 9], "requir": [0, 3, 6, 7, 8, 9], "care": [0, 3, 4, 5, 6, 7, 8, 9], "retriev": [0, 4, 5, 7], "strategi": [0, 3, 4, 5, 6, 7, 8, 9], "tradit": [0, 3, 6, 7, 8], "methodologi": [0, 3, 5, 7, 8, 9], "break": [0, 1, 3, 4, 5, 6, 8], "down": [0, 1, 4, 5, 6, 7, 8], "deal": [0, 3, 6, 7], "determinist": [0, 6, 9], "gener": [0, 1, 4, 7, 9], "new": [0, 2, 3, 4, 5, 6, 7, 8, 9], "safeti": [0, 3, 5, 9], "can": [0, 1, 3, 4, 5, 6, 7, 8, 9], "harm": [0, 3, 5, 7], "bias": [0, 3, 5, 6, 7, 8, 9], "inappropri": [0, 3, 8], "safeguard": [0, 5, 8], "monitor": [0, 3, 4, 5, 6, 7, 8], "ensur": [0, 3, 4, 5, 6, 7, 8, 9], "safe": [0, 3, 5, 8, 9], "deploy": [0, 3, 4, 5, 8, 9], "align": [0, 4, 5, 6, 7, 8, 9], "next": [0, 1, 3, 4, 5, 6, 7, 8, 9], "token": [0, 1, 3, 4, 5, 6, 7, 8, 9], "predict": [0, 1, 3, 5, 6, 7, 8, 9], "mean": [0, 3, 4, 5, 6, 7, 8, 9], "thei": [0, 1, 3, 4, 5, 6, 7, 8, 9], "user": [0, 1, 4, 5, 6, 7, 9], "": [0, 1, 3, 4, 5, 6, 7, 8, 9], "prefer": [0, 5, 6, 7, 8, 9], "default": [0, 3, 5, 6, 7, 8, 9], "vendor": [0, 4, 5, 7], "lock": [0, 3, 4, 7], "cloud": [0, 3, 4, 5, 6, 7, 8, 9], "base": [0, 1, 4, 7, 9], "provid": [0, 2, 3, 4, 5, 6, 7, 8, 9], "creat": [0, 1, 3, 4, 5, 6, 7, 8, 9], "signific": [0, 3, 4, 5, 6, 7, 8, 9], "depend": [0, 3, 4, 5, 6, 7, 9], "through": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "proprietari": [0, 3, 7, 8, 9], "infrastructur": [0, 4, 7], "difficult": [0, 3, 5, 6, 8], "switch": [0, 7], "self": [0, 3, 5, 6, 7, 8, 9], "host": [0, 4, 5, 7, 8], "cost": [0, 3, 5, 6, 8, 9], "optim": [0, 1, 5, 6, 7, 8], "The": [0, 1, 3, 6, 8, 9], "financi": [0, 1, 3, 4, 5, 6, 8, 9], "quickli": [0, 3, 4, 6, 7], "becom": [0, 3, 4, 5, 6, 7, 8, 9], "prohibit": [0, 3, 5, 6, 7], "without": [0, 1, 3, 4, 5, 6, 7, 8, 9], "conclud": [0, 5, 6, 7], "discuss": [0, 4, 5, 7, 8], "futur": [0, 3, 4, 5, 7, 8], "aris": [0, 3, 5, 6, 8], "move": [0, 3, 4, 5, 6, 7, 8], "forward": [0, 3, 5, 8], "take": [0, 2, 3, 4, 5, 6, 7, 8, 9], "hand": [0, 6, 7, 8, 9], "focu": [0, 2, 3, 4, 5, 6, 7, 8, 9], "access": [0, 3, 4, 5, 6, 7, 8, 9], "all": [0, 1, 3, 4, 5, 6, 7, 8, 9], "fulli": [0, 3, 5, 8], "document": [0, 3, 4, 5, 7, 8, 9], "allow": [0, 5, 6, 7, 8, 9], "reader": [0, 2, 6, 8], "replic": [0, 5, 6, 8, 9], "result": [0, 3, 4, 5, 6, 8, 9], "exactli": [0, 5, 6, 9], "design": [0, 1, 3, 6, 7, 9], "run": [0, 3, 4, 5, 6, 7, 8, 9], "consum": [0, 3, 4, 5, 6, 7, 8, 9], "grade": [0, 3, 4, 5, 6, 7, 8], "hardwar": [0, 3, 4, 5], "expens": [0, 3, 4, 5, 6, 7, 8], "avail": [0, 3, 4, 5, 6, 7, 8, 9], "notebook": [0, 3, 6, 9], "modifi": [0, 3, 5, 8, 9], "extend": [0, 3, 4, 5, 6, 7, 9], "minim": [0, 3, 4, 5, 6, 7, 8, 9], "effect": [0, 1, 3, 4, 5, 6, 8, 9], "framework": [0, 3, 4, 5, 7], "wai": [0, 3, 4, 5, 6, 7, 8, 9], "priorit": [0, 3, 5, 6, 7, 8], "transpar": [0, 3, 4, 5, 7, 8], "visibl": [0, 5], "being": [0, 3, 4, 5, 6, 7, 8, 9], "better": [0, 2, 3, 4, 5, 6, 7, 8, 9], "understand": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "custom": [0, 3, 5, 6, 9], "flexibl": [0, 4, 5, 6, 7, 8, 9], "adapt": [0, 3, 4, 5, 7, 8], "us": [0, 1, 3, 4, 7, 8, 9], "case": [0, 4, 5, 9], "unlik": [0, 3, 5, 7], "black": [0, 3], "box": [0, 7], "commerci": [0, 5, 7, 8, 9], "most": [0, 3, 4, 5, 6, 7, 8, 9], "freeli": [0, 9], "foster": [0, 3, 5, 8, 9], "reduc": [0, 3, 4, 5, 6, 7, 8, 9], "independ": [0, 5, 6, 8, 9], "freedom": [0, 7, 9], "architectur": [0, 3, 4, 5, 6, 7, 9], "decis": [0, 3, 4, 5, 6, 7, 8], "keep": [0, 3, 5, 6, 7, 8], "principl": [0, 3, 5, 7, 8], "itself": [0, 3, 5, 6, 7, 8], "live": [0, 1, 5, 6, 8], "evolv": [0, 4, 5, 6, 7, 8], "chang": [0, 3, 5, 6, 7, 8], "encourag": [0, 3, 5, 6, 8, 9], "report": [0, 3, 5, 6, 7, 8, 9], "suggest": [0, 3, 5, 6, 7, 8, 9], "improv": [0, 3, 4, 5, 6, 7, 8, 9], "contribut": [0, 4, 5, 6, 7, 8], "via": [0, 3, 4, 5, 6, 7, 8, 9], "pull": [0, 7], "request": [0, 3, 4, 5, 6, 7, 8, 9], "share": [0, 3, 5, 6, 7, 8, 9], "own": [0, 3, 4, 5, 6, 7, 8], "experi": [0, 3, 4, 5, 6, 7, 8, 9], "commun": [0, 3, 4, 5, 6, 8, 9], "propos": [0, 4, 5, 6, 8], "chapter": [0, 3, 4, 5, 6, 7, 8, 9], "section": [0, 3, 4, 5, 6, 7, 8, 9], "found": [0, 3, 4, 5, 7, 9], "http": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "com": [0, 2, 3, 4, 5, 6, 7, 8, 9], "souzatharsi": [0, 2, 3, 4, 5, 6, 7, 8, 9], "tamingllm": [0, 2, 3, 4, 5, 6, 7, 8, 9], "whether": [0, 3, 4, 5, 6, 7, 8, 9], "you": [0, 1, 3, 4, 5, 6, 7, 8, 9], "ve": [0, 7], "typo": [0, 8], "want": [0, 1, 3, 6, 7, 8, 9], "welcom": 0, "pleas": [0, 3, 5, 7, 8], "feel": [0, 6, 7], "free": [0, 1, 3, 5, 6, 7, 8], "look": [0, 2, 3, 4, 5, 6, 7, 8], "our": [0, 1, 3, 4, 5, 6, 7, 8, 9], "goal": [0, 1, 3, 5, 6, 8, 9], "discourag": [0, 6], "enabl": [0, 3, 4, 5, 6, 7, 8, 9], "By": [0, 1, 2, 3, 5, 6, 8, 9], "upfront": [0, 2, 4], "equip": [0, 2, 5, 6, 8], "avoid": [0, 3, 5, 7, 8, 9], "current": [0, 2, 3, 4, 5, 6, 8, 9], "discours": [0, 2], "around": [0, 2, 3, 5, 6, 7, 8, 9], "tend": [0, 2, 5, 8], "toward": [0, 3, 5, 8, 9], "extrem": [0, 3, 4, 5, 6, 8], "either": [0, 3, 5, 6, 7, 8], "uncrit": 0, "enthusiasm": 0, "wholesal": [0, 5], "dismiss": 0, "differ": [0, 3, 4, 5, 6, 7, 8, 9], "rather": [0, 1, 3, 4, 5, 6, 7, 8], "than": [0, 1, 3, 5, 6, 7, 8, 9], "theoret": [0, 3], "examin": [0, 3, 5, 6, 7, 8, 9], "first": [0, 1, 3, 4, 5, 6, 7, 8, 9], "everi": [0, 4, 5, 6, 8], "concept": [0, 3, 5, 6, 8], "illustr": [0, 3, 5, 6, 7, 8, 9], "execut": [0, 5, 7, 8], "immedi": [0, 3, 4, 5, 7], "analysi": [0, 1, 3, 4, 5, 6, 7, 8], "balanc": [0, 3, 4, 5, 6, 7, 8, 9], "both": [0, 3, 4, 5, 6, 7, 8], "help": [0, 3, 4, 5, 6, 7, 8, 9], "inform": [0, 3, 4, 5, 6, 7, 8, 9], "lead": [0, 1, 3, 4, 5, 6, 7, 8, 9], "genai": [0, 1, 3, 6, 8], "initi": [0, 1, 3, 4, 5, 6, 7, 8, 9], "advoc": [0, 8], "anyon": [0, 8], "seek": [0, 5, 6, 7, 8], "work": [0, 1, 3, 4, 5, 6, 7, 8, 9], "typic": [0, 3, 4, 5, 6, 7, 8, 9], "job": [0, 5, 6, 7, 8], "role": [0, 3, 5, 6, 7, 8, 9], "platform": [0, 5, 6, 7, 8, 9], "backend": [0, 3, 5], "exist": [0, 3, 4, 5, 7], "ml": [0, 6, 8], "transit": [0, 4, 5, 7, 9], "overse": 0, "motiv": [0, 3, 4, 5, 6, 9], "need": [0, 3, 4, 5, 6, 7, 8, 9], "readi": [0, 5, 6, 8], "desir": [0, 1, 3, 5, 6, 9], "perform": [0, 3, 5, 6, 8, 9], "earli": [0, 3, 4, 5, 6, 8, 9], "befor": [0, 3, 4, 5, 6, 8, 9], "costli": [0, 5, 6, 8], "problem": [0, 1, 2, 3, 4, 6, 7, 8], "too": [0, 1, 3, 5, 7, 8], "late": [0, 3, 4, 8, 9], "lifecycl": [0, 7, 8], "after": [0, 1, 3, 5, 6, 7, 8, 9], "read": [0, 3, 4, 5, 6, 8, 9], "implic": [0, 1, 3, 5, 8], "recommend": [0, 3, 5, 6, 7, 8, 9], "abl": [0, 3, 5, 9], "deploi": [0, 3, 5, 7, 8], "proper": [0, 3, 4, 7, 8, 9], "realist": [0, 3, 4, 8], "effort": [0, 5, 7, 8, 9], "estim": [0, 4, 5, 6, 8], "project": [0, 3, 4, 5, 6, 7, 8], "impact": [0, 3, 4, 5, 6, 7, 8, 9], "timelin": 0, "To": [0, 3, 5, 7, 8, 9], "should": [0, 3, 4, 5, 6, 7, 8, 9], "basic": [0, 3, 5, 6, 7, 8], "program": [0, 5, 6, 7, 9], "knowledg": [0, 3, 5, 7, 8], "mistral": [0, 3, 9], "openai": [0, 3, 5, 6, 7, 9], "anthrop": [0, 3, 6, 9], "similar": [0, 3, 4, 5, 6, 7, 9], "dive": [0, 4], "here": [0, 2, 3, 4, 5, 6, 7, 8, 9], "get": [0, 3, 4, 5, 6, 7, 8, 9], "start": [0, 3, 4, 5, 6, 7, 8, 9], "clone": [0, 3], "companion": 0, "git": 0, "cd": 0, "activ": [0, 3, 4, 5, 6, 7, 8], "virtual": [0, 5], "m": [0, 3, 5, 6, 7, 8, 9], "venv": [0, 9], "tame": [0, 3, 4, 5, 6, 7, 8, 9], "env": [0, 3, 5, 6, 8, 9], "bin": [0, 7], "On": [0, 5, 6, 7, 9], "window": [0, 4, 5, 6, 7], "script": [0, 7], "try": [0, 1, 3, 5, 6, 8, 9], "each": [0, 3, 4, 5, 6, 7, 8, 9], "contain": [0, 3, 4, 5, 6, 7, 8, 9], "possibl": [0, 3, 4, 5, 6, 7, 8, 9], "includ": [0, 1, 3, 4, 5, 6, 7, 8, 9], "necessari": [0, 3, 4, 5, 8], "instal": [0, 3, 5, 7, 9], "go": [0, 3, 5, 6, 9], "packag": [0, 4, 5, 6, 7, 9], "e": [0, 1, 3, 4, 5, 6, 7, 8, 9], "g": [0, 3, 4, 5, 6, 7, 8, 9], "pip": [0, 3, 5, 7, 9], "poetri": [0, 8], "file": [0, 3, 5, 6, 7, 8, 9], "root": [0, 3], "directori": [0, 5, 6, 7], "add": [0, 3, 5, 6, 7, 8], "other": [0, 3, 4, 5, 6, 7, 8, 9], "openai_api_kei": [0, 3], "your_openai_api_key_her": 0, "never": [0, 9], "commit": [0, 3, 5, 8], "version": [0, 3, 4, 5, 6, 7, 8, 9], "control": [0, 1, 3, 4, 5, 6, 7, 8, 9], "kept": [0, 5], "privat": [0, 5], "If": [0, 1, 3, 4, 5, 6, 7, 8, 9], "encount": [0, 2, 5, 8], "rate": [0, 3, 4, 5, 6, 7, 8], "consid": [0, 3, 4, 5, 6, 7, 8, 9], "smaller": [0, 3, 4, 5, 6, 7, 9], "retri": [0, 9], "logic": [0, 1, 3, 5, 6, 8], "conflict": [0, 3, 5], "fresh": 0, "like": [0, 1, 3, 4, 5, 6, 7, 8, 9], "check": [0, 5, 6, 7, 8, 9], "page": [0, 5, 6, 7], "known": [0, 5, 6, 8, 9], "now": [0, 1, 3, 4, 5, 6, 7, 8, 9], "let": [0, 3, 4, 5, 6, 7, 8, 9], "begin": [0, 5, 7, 8, 9], "explor": [0, 1, 3, 4, 5, 6, 7, 8, 9], "tharsi": [0, 2, 3, 4, 5, 6, 7, 8, 9], "souza": [0, 2, 3, 4, 5, 6, 7, 8, 9], "ph": [0, 8], "d": [0, 3, 4, 5, 6, 7, 8, 9], "scienc": [0, 3, 5, 8], "ucl": 0, "univers": [0, 5, 7, 8], "london": 0, "scientist": [0, 1, 7, 8], "special": [0, 4, 5, 6, 7, 8, 9], "he": [0, 3, 5, 6, 8], "lectur": 0, "columbia": 0, "master": [0, 4, 7, 9], "appli": [0, 3, 5, 6, 7, 8, 9], "analyt": 0, "incom": [0, 5, 6], "head": [0, 3, 5, 6, 8, 9], "equiti": [0, 5, 6], "citadel": 0, "former": [0, 1, 5, 7], "senior": [0, 5], "vp": 0, "two": [0, 3, 4, 5, 6, 7, 8, 9], "sigma": [0, 3], "invest": [0, 3, 4, 5, 6, 8], "mentor": 0, "under": [0, 3, 4, 5, 7, 8, 9], "repres": [0, 3, 4, 5, 6, 7, 9], "student": [0, 3, 6, 8], "profession": [0, 3, 5, 6, 8, 9], "divers": [0, 3, 4, 5, 6, 8], "global": [0, 5, 6, 8], "ecosystem": [0, 4, 5, 7], "With": [0, 3, 5, 6, 7, 8, 9], "over": [0, 2, 3, 4, 5, 6, 7, 8, 9], "15": [0, 5, 6, 7, 8, 9], "deliv": [0, 4, 5, 6, 7], "across": [0, 3, 4, 5, 6, 7, 8, 9], "startup": 0, "fortun": 0, "500": [0, 3, 5, 6, 8], "compani": [0, 3, 4, 5, 6, 8, 9], "also": [0, 3, 4, 5, 6, 7, 8, 9], "numer": [0, 4, 5, 6, 8, 9], "scholarli": 0, "frequent": [0, 5, 6, 7, 9], "speaker": [0, 5], "academ": [0, 3, 5, 8], "busi": [0, 5, 6, 7, 8], "confer": [0, 6, 9], "ground": [0, 3, 5, 6, 7], "background": [0, 1, 5, 6, 7], "draw": [0, 3, 5, 8, 9], "scale": [0, 3, 4, 5, 6, 7, 8, 9], "stage": [0, 3, 8, 9], "major": [0, 3, 4, 5, 6, 7, 8, 9], "institut": [0, 5, 8], "well": [0, 3, 4, 5, 6, 7, 8, 9], "uniqu": [0, 3, 4, 5, 6, 7, 8, 9], "bridg": [0, 7, 8], "gap": [0, 1, 3, 4, 6, 7, 8], "between": [0, 1, 3, 4, 5, 6, 7, 8, 9], "potenti": [0, 1, 3, 4, 5, 6, 7, 8, 9], "tell": [1, 3, 8], "mere": [1, 5], "what": [1, 3, 4, 5, 6, 7, 8, 9], "someth": [1, 5, 7], "i": [1, 2, 4, 5, 7, 8, 9], "emanuel": [1, 3, 5, 8], "derman": 1, "an": [1, 2, 3, 4, 5, 6, 7, 8, 9], "altern": [1, 3, 4, 5, 6, 7, 8], "titl": [1, 2, 3, 4, 5, 6, 7, 8, 9], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9], "book": [1, 5, 6], "could": [1, 3, 4, 5, 6, 7, 8, 9], "been": [1, 3, 4, 5, 6, 7, 8], "behav": 1, "badli": 1, "come": [1, 3, 5, 6, 7, 8, 9], "notic": [1, 3, 4, 5, 6, 8, 9], "parallel": [1, 3, 5, 7], "semin": [1, 8], "2011": 1, "coincident": 1, "just": [1, 3, 4, 5, 6, 7, 8, 9], "caution": 1, "against": [1, 3, 4, 5, 6, 7, 8], "treat": [1, 5, 8], "perfect": [1, 5, 7], "represent": [1, 5, 6, 7, 8], "realiti": [1, 6, 8], "aim": [1, 3, 4, 5, 6, 7, 8, 9], "highlight": [1, 3, 5, 6, 7, 8, 9], "practic": [1, 3, 4, 5, 6, 8], "physicist": 1, "goldman": 1, "sach": 1, "quant": 1, "scientif": [1, 3, 5, 7], "fail": [1, 3, 5, 6, 8], "we": [1, 3, 4, 5, 6, 7, 8, 9], "mistak": [1, 8], "approxim": [1, 4, 5, 9], "full": [1, 3, 4, 5, 6, 7, 8, 9], "assumpt": [1, 5, 8], "core": [1, 4, 5, 6, 7, 8], "premis": [1, 7], "hi": [1, 5, 8, 9], "aspect": [1, 3, 5, 6, 8], "world": [1, 3, 4, 5, 6, 7, 8, 9], "inher": [1, 2, 3, 5, 8], "involv": [1, 3, 4, 5, 6, 7, 8, 9], "simplif": 1, "argu": [1, 4, 8, 9], "crise": 1, "2008": 1, "crash": 1, "occur": [1, 3, 5, 8], "part": [1, 3, 4, 5, 6, 8, 9], "becaus": [1, 3, 5, 6, 8], "peopl": [1, 3, 5, 7, 8], "put": [1, 5, 7], "much": [1, 3, 5, 6, 7], "faith": 1, "mathemat": [1, 5, 6, 7, 9], "recogn": [1, 3, 5, 8], "human": [1, 4, 5, 6, 7, 8, 9], "behavior": [1, 3, 5, 7, 8], "market": [1, 4, 5, 6, 7, 9], "dynam": [1, 3, 5, 6, 8], "constraint": [1, 3, 4, 5, 6, 7, 8, 9], "hallucin": [1, 3, 5, 6, 8, 9], "fact": [1, 3, 5, 6, 8], "reason": [1, 3, 5, 6, 7, 8, 9], "Their": [1, 5, 9], "respons": [1, 4, 5, 6, 7, 8, 9], "often": [1, 3, 4, 5, 6, 7, 8, 9], "convinc": [1, 3], "probabilist": [1, 5, 9], "train": [1, 4, 5, 6, 7, 8, 9], "true": [1, 3, 4, 5, 6, 8, 9], "even": [1, 3, 4, 5, 6, 7, 8, 9], "though": [1, 3, 4, 5, 6, 7, 8, 9], "insist": 1, "machin": [1, 3, 6, 7, 8, 9], "todai": [1, 4, 7, 9], "grow": [1, 3, 5, 6, 7, 8, 9], "pervas": [1, 8], "belief": [1, 7, 8], "solv": [1, 3, 4, 5, 7, 8, 9], "ani": [1, 3, 4, 5, 6, 7, 8, 9], "content": 1, "moreov": [1, 6], "were": [1, 3, 5, 7, 8, 9], "chatbot": [1, 3, 5, 6, 7, 8], "twist": [1, 8], "wrap": [1, 6, 7, 9], "further": [1, 3, 4, 5, 6, 7, 8, 9], "daili": [1, 4, 7, 8], "life": [1, 5, 7, 8], "workflow": [1, 4, 5, 7, 8, 9], "affect": [1, 5, 6, 7, 8], "decid": [1, 3, 5, 6], "action": [1, 3, 5, 6, 8], "coupl": [1, 7], "lack": [1, 3, 5, 6, 8, 9], "pose": [1, 3, 5, 6, 8, 9], "risk": [1, 3, 4, 5, 6, 7], "still": [1, 4, 5, 6, 7, 8], "figur": [1, 5, 7], "out": [1, 3, 4, 5, 6, 7, 8, 9], "serv": [1, 3, 4, 5, 6, 8, 9], "introductori": [1, 2], "practition": [1, 4, 5, 7, 9], "builder": [1, 7], "who": [1, 3, 5, 6, 7, 8, 9], "remain": [1, 3, 4, 5, 6, 7, 8], "clear": [1, 3, 4, 5, 6, 7, 8, 9], "ei": 1, "about": [1, 3, 4, 5, 6, 7, 8, 9], "therefor": [1, 3, 5, 6, 7, 8], "end": [1, 3, 4, 5, 6, 7, 8, 9], "detail": [1, 3, 4, 5, 6, 7, 8, 9], "python": [1, 2, 5, 6, 7, 8, 9], "code": [1, 2, 3, 5, 6, 7, 8, 9], "diminish": [1, 3, 4, 5, 6], "promot": [1, 3, 5, 8], "nuanc": [1, 3, 5, 6, 7, 8, 9], "acknowledg": [1, 5, 8], "within": [1, 3, 4, 5, 6, 7, 8, 9], "trustworthi": [1, 8], "taught": 1, "u": [1, 3, 5, 6, 8, 9], "step": [1, 3, 4, 5, 6, 7, 8, 9], "where": [1, 3, 4, 5, 6, 7, 8, 9], "der11": 1, "why": [1, 3, 5, 8, 9], "confus": [1, 4, 8], "illus": 1, "disast": [1, 5], "wall": [1, 7], "street": [1, 7], "press": [1, 5, 7], "isbn": [1, 3, 5, 6], "9781439165010": 1, "url": [1, 2, 3, 4, 5, 6, 7, 8, 9], "googl": [1, 5, 7, 9], "co": [1, 3, 4, 5, 6, 7, 8, 9], "uk": [1, 8], "id": [1, 5, 6, 7, 8, 9], "lke_cwm4wm8c": 1, "sign": [2, 5, 8], "up": [2, 3, 4, 5, 6, 7, 8], "receiv": [2, 3, 5, 7, 8, 9], "updat": [2, 3, 4, 5, 6, 7, 8, 9], "abstract": [2, 5, 6, 8, 9], "heavili": [2, 3, 4, 5, 6, 8, 9], "gloss": 2, "fundament": [2, 3, 5, 6, 7, 8, 9], "challeng": [2, 3, 4, 5, 7, 8, 9], "convers": [2, 3, 4, 5, 6, 7, 8, 9], "kei": [2, 3, 4, 6, 7, 8, 9], "proven": [2, 4], "yet": [2, 3, 4, 5, 6, 7, 8, 9], "concret": [2, 4, 8, 9], "sidestep": 2, "misc": [2, 3, 4, 5, 6, 7, 8, 9], "tharsistpsouza2024tamingllm": [2, 3, 4, 5, 6, 7, 8, 9], "author": [2, 3, 4, 5, 6, 7, 8, 9], "t": [2, 3, 4, 5, 6, 7, 8, 9], "p": [2, 3, 4, 5, 6, 7, 8, 9], "2024": [2, 3, 4, 5, 6, 8, 9], "journal": [2, 3, 4, 5, 6, 7, 8, 9], "repositori": [2, 3, 4, 5, 6, 7, 8, 9], "valu": [3, 5, 6, 7, 8, 9], "its": [3, 4, 5, 6, 7, 8, 9], "privileg": 3, "abov": [3, 5, 6, 8], "soon": [3, 9], "lose": [3, 5], "dwight": 3, "eisenhow": 3, "releas": [3, 4, 5, 6, 7, 8], "3": [3, 4, 5, 6, 7, 9], "5": [3, 4, 5, 6, 7, 9], "2022": [3, 5, 7, 8], "mark": [3, 5, 6, 7, 8], "moment": [3, 8], "histori": [3, 4, 5, 6, 7], "artifici": [3, 5, 7, 8], "intellig": [3, 5, 6, 7, 8], "five": [3, 5, 8], "dai": [3, 4, 5, 6, 7, 8, 9], "launch": [3, 5, 8], "attract": [3, 5], "million": [3, 4, 5, 6, 7], "month": [3, 4, 5, 7, 8], "becam": [3, 4], "fastest": [3, 5, 8], "100": [3, 4, 5, 7, 8, 9], "monthli": [3, 4, 5], "rais": [3, 4, 5, 8], "intrigu": 3, "question": [3, 4, 5, 6, 7, 8, 9], "did": [3, 5, 6, 9], "observ": [3, 4, 5, 6, 7, 8, 9], "dramat": [3, 4, 5, 7, 9], "traction": [3, 7], "predecessor": 3, "gpt": [3, 4, 5, 6, 7, 8, 9], "had": [3, 5, 8], "same": [3, 5, 6, 7, 8, 9], "size": [3, 5, 6, 7, 8, 9], "number": [3, 4, 5, 6, 7, 8, 9], "paramet": [3, 4, 5, 6, 7, 8, 9], "far": [3, 4, 7, 8], "less": [3, 4, 5, 6, 7, 8], "attent": [3, 4, 6, 7], "arguabl": [3, 5, 6, 7], "feedback": [3, 5, 8, 9], "abil": [3, 4, 5, 6, 7, 8, 9], "breakthrough": [3, 7, 8], "demonstr": [3, 4, 5, 6, 7, 8, 9], "crucial": [3, 4, 6, 7, 8, 9], "greater": [3, 5, 6, 7, 8], "process": [3, 4, 5, 6, 7, 8], "modern": [3, 5, 6, 9], "techniqu": [3, 4, 5, 6, 7], "direct": [3, 5, 7, 8], "rafailov": 3, "et": [3, 4, 5, 6, 7, 8, 9], "al": [3, 4, 5, 6, 7, 8, 9], "present": [3, 5, 6, 7, 8, 9], "autom": [3, 4, 5, 8, 9], "fashion": [3, 9], "open": [3, 4, 5, 6, 8, 9], "sourc": [3, 4, 5, 6, 8, 9], "common": [3, 4, 5, 6, 7, 9], "pre": [3, 4, 5, 6, 7, 8, 9], "state": [3, 5, 6, 7, 8, 9], "art": [3, 5, 8], "object": [3, 4, 5, 6, 7, 8, 9], "veri": [3, 4, 5, 6, 7, 8], "ask": [3, 5, 6, 7, 8, 9], "instruct": [3, 4, 5, 6, 7, 8, 9], "sai": [3, 9], "ouyang": [3, 8], "2": [3, 4, 5, 6, 9], "explain": [3, 5, 6], "moon": 3, "land": [3, 5, 7], "6": [3, 4, 5, 6, 7], "old": [3, 5], "import": [3, 4, 5, 6, 7, 8, 9], "pipelin": [3, 4, 5, 7, 8, 9], "pipe": [3, 8], "text": [3, 4, 5, 6, 7, 8, 9], "gpt2": [3, 5], "msg": [3, 6], "short": [3, 5, 6, 8, 9], "sentenc": [3, 5, 6, 8], "_": [3, 5, 8, 9], "rang": [3, 4, 5, 6, 7, 8, 9], "len": [3, 5, 6, 7, 8, 9], "print": [3, 4, 5, 6, 7, 8, 9], "f": [3, 4, 5, 6, 7, 8, 9], "n": [3, 5, 6, 7, 8, 9], "1": [3, 4, 5, 6, 7, 9], "0": [3, 4, 5, 6, 7, 8, 9], "generated_text": [3, 9], "good": [3, 5, 6, 7, 9], "idea": [3, 4, 7, 8, 9], "one": [3, 4, 5, 6, 7, 8, 9], "those": [3, 5, 6, 8, 9], "littl": [3, 5], "green": [3, 6, 8], "dot": [3, 4, 6], "Then": [3, 4, 5, 6], "line": [3, 5, 6, 7, 8], "later": [3, 5, 6, 7, 8, 9], "re": [3, 4, 5, 6, 7, 8, 9], "alreadi": [3, 5, 6, 9], "movi": 3, "theori": [3, 5, 6], "some": [3, 5, 6, 7, 8, 9], "word": [3, 4, 5, 6, 8, 9], "tepid": 3, "articl": [3, 5, 7, 8], "sure": [3, 5, 6, 8, 9], "lunar": 3, "As": [3, 4, 5, 6, 7, 8, 9], "see": [3, 4, 5, 6, 7, 8, 9], "coher": [3, 5, 6, 7, 9], "explan": [3, 5, 8, 9], "child": [3, 5, 8], "complet": [3, 5, 6, 7, 8, 9], "instead": [3, 4, 5, 6, 7, 8, 9], "second": [3, 4, 5, 6, 7, 8], "nonsens": [3, 8], "meander": 3, "unrel": [3, 5, 8], "topic": [3, 5, 6, 7, 8, 9], "simpl": [3, 5, 6, 7, 8, 9], "appropri": [3, 4, 5, 6, 7, 8, 9], "young": [3, 5, 8], "given": [3, 4, 5, 6, 7, 8, 9], "sequenc": [3, 5, 6, 7, 9], "address": [3, 4, 5, 6, 7, 8, 9], "issu": [3, 5, 6, 8, 9], "introduc": [3, 5, 6, 7, 8, 9], "rlhf": [3, 4, 8, 9], "intent": [3, 8], "wide": [3, 4, 5, 6, 7, 8, 9], "task": [3, 4, 6, 8, 9], "fig": [3, 4, 5, 6, 7, 8, 9], "7": [3, 4, 5, 6, 7, 8], "collect": [3, 5, 6, 7, 8, 9], "sampl": [3, 6, 7, 9], "label": [3, 5, 7, 8, 9], "comparison": [3, 6], "reward": [3, 5, 7, 8], "sever": [3, 4, 5, 6, 7, 8, 9], "rank": [3, 5, 6, 7, 8], "best": [3, 4, 5, 6, 7, 8], "worst": 3, "rm": [3, 7], "reinforc": [3, 5, 7, 8], "write": [3, 5, 6, 7, 8, 9], "stori": [3, 8], "frog": 3, "calcul": [3, 4, 5, 6, 7, 8, 9], "score": [3, 4, 5, 6, 7, 8, 9], "ppo": [3, 7], "proxim": [3, 7], "iter": [3, 5, 6, 7, 8, 9], "accur": [3, 4, 5, 6, 7, 8], "undesir": [3, 8], "simplifi": [3, 5, 6, 7, 9], "view": [3, 5, 6, 8], "show": [3, 4, 5, 6, 7, 8, 9], "progress": [3, 4, 8], "pattern": [3, 4, 5, 6, 7, 8, 9], "ha": [3, 4, 5, 6, 7, 8, 9], "instanc": [3, 4, 5, 6, 7, 8], "directli": [3, 4, 5, 6, 7, 8, 9], "For": [3, 4, 5, 6, 7, 8, 9], "guard": 3, "team": [3, 5, 6, 7, 9], "8b": [3, 7, 8, 9], "wa": [3, 4, 5, 6, 7, 8, 9], "classif": [3, 5, 6, 7, 8, 9], "bypass": [3, 8], "similarli": [3, 4, 5, 7, 8], "zephyr": 3, "7b": [3, 5, 7, 8, 9], "alpha": [3, 5, 9], "huggingfac": [3, 4, 5, 6, 7, 8, 9], "publicli": [3, 5, 9], "assist": [3, 5, 6, 7, 8, 9], "paper": [3, 5, 7, 8, 9], "compon": [3, 5, 6, 7], "particular": [3, 4, 5, 6, 7, 8, 9], "foundat": [3, 4, 5, 6, 7, 8], "advanc": [3, 4, 5, 6, 7, 8, 9], "method": [3, 5, 6, 8, 9], "strong": [3, 5, 6, 7, 8, 9], "At": [3, 4, 5, 6, 7, 9], "high": [3, 4, 5, 6, 7, 8, 9], "level": [3, 4, 5, 6, 8, 9], "carefulli": [3, 4, 5, 6, 7, 8, 9], "curat": [3, 5, 7], "purpos": [3, 5, 6, 7, 8, 9], "exhibit": [3, 5, 7, 8], "domain": [3, 4, 5, 6, 7, 8], "emploi": [3, 5, 6, 8, 9], "prove": [3, 5, 6, 8], "particularli": [3, 4, 5, 6, 7, 8, 9], "valuabl": [3, 5, 6, 7, 9], "scenario": [3, 5, 7, 8, 9], "precis": [3, 4, 5, 6, 7, 8, 9], "style": [3, 5], "tone": 3, "expertis": [3, 5, 6, 8], "medic": [3, 5, 7], "legal": [3, 5, 6, 7, 8], "field": [3, 5, 6, 7, 8, 9], "adher": [3, 5, 6, 8, 9], "guidelin": [3, 5, 8], "servic": [3, 4, 5, 6, 7, 8], "standard": [3, 4, 5, 6, 7, 8], "approach": [3, 5, 6, 7, 9], "distinct": [3, 5, 7, 8, 9], "advantag": [3, 4, 5, 6, 7, 8, 9], "weight": [3, 4, 5, 6, 7, 8, 9], "maximum": [3, 5, 6, 7, 8], "lora": [3, 7, 8], "low": [3, 4, 5, 6, 7, 8, 9], "hu": [3, 6, 8, 9], "2021": [3, 4, 5, 6], "small": [3, 4, 5, 6, 7, 9], "matric": 3, "effici": [3, 4, 5, 6, 7, 8, 9], "qlora": 3, "quantiz": [3, 6], "dettmer": 3, "2023": [3, 4, 5, 6, 7, 8, 9], "combin": [3, 4, 5, 6, 7, 8, 9], "memori": [3, 4, 5, 6, 7, 8], "footprint": [3, 4, 6, 7], "modest": [3, 7], "increas": [3, 4, 5, 6, 7, 8, 9], "likelihood": [3, 5, 6, 8, 9], "obtain": [3, 5, 6, 7, 8, 9], "probabl": [3, 5, 7, 9], "outcom": [3, 5, 8, 9], "hong": [3, 5], "unintend": [3, 8], "suboptim": 3, "seen": [3, 5, 6, 8], "form": [3, 4, 5, 7, 8, 9], "research": [3, 4, 5, 6, 7], "maxim": [3, 5, 6], "shown": [3, 5, 6, 7, 8], "alon": [3, 5, 6, 7, 8], "gain": [3, 4, 5, 7, 8], "achiev": [3, 4, 5, 6, 7, 8, 9], "bai": [3, 5, 8], "touvron": [3, 7], "schulman": [3, 8], "2017": [3, 5], "algorithm": [3, 5, 8], "popular": [3, 6, 7, 9], "sinc": [3, 4, 5, 6, 7, 8, 9], "understood": [3, 6], "set": [3, 4, 5, 6, 7, 8, 9], "rule": [3, 5, 6, 7, 9], "govern": [3, 5, 6], "reflect": [3, 5, 6, 7, 8], "anoth": [3, 5, 6, 7, 8], "adjust": [3, 5, 7, 8, 9], "One": [3, 4, 5, 6, 7, 8, 9], "strength": [3, 5, 6, 7, 8], "2024c": [3, 7], "real": [3, 4, 5, 6, 7, 8, 9], "noisi": 3, "delai": [3, 5, 7, 8], "subsequ": [3, 6, 9], "situat": [3, 5, 6, 8], "clip": 3, "surrog": 3, "function": [3, 4, 5, 6, 7, 8, 9], "stabl": [3, 5, 6], "prevent": [3, 4, 5, 8, 9], "overreact": 3, "converg": 3, "due": [3, 5, 6, 7, 8], "simplic": [3, 7], "award": [3, 5], "runner": 3, "neurip": 3, "blog": [3, 4, 5, 7, 8, 9], "4": [3, 4, 5, 6, 7, 9], "fit": [3, 4, 5, 6, 8, 9], "pair": [3, 5, 6, 8], "rl": [3, 8], "find": [3, 4, 5, 6, 7, 8, 9], "contrast": [3, 4, 5, 6, 7, 8, 9], "satisfi": [3, 5], "implicit": [3, 5, 6, 8], "whose": [3, 5], "correspond": [3, 5, 6, 9], "extract": [3, 4, 5, 7, 8, 9], "close": [3, 5, 6, 7, 8], "compar": [3, 4, 5, 6, 7, 8], "assign": [3, 5, 6, 7, 8, 9], "higher": [3, 4, 5, 6, 7, 9], "kl": [3, 7], "diverg": [3, 7], "origin": [3, 4, 5, 6, 7, 8, 9], "preserv": [3, 6, 7, 8, 9], "defin": [3, 4, 5, 6, 7, 8, 9], "equat": 3, "mathcal": 3, "l": [3, 5, 6], "pi_": 3, "theta": [3, 9], "ref": 3, "mathbb": [3, 9], "x": [3, 5, 6, 7, 8, 9], "y_w": 3, "y_l": 3, "sim": [3, 9], "left": [3, 6, 7], "log": [3, 4, 5, 7], "beta": [3, 5, 6, 8, 9], "frac": [3, 7, 8], "right": [3, 5, 6, 7, 8], "respect": [3, 5, 6, 7, 8], "deviat": [3, 5, 7, 8], "straightforward": [3, 5, 6, 7, 8, 9], "librari": [3, 4, 5, 6, 7, 8, 9], "trl": [3, 7, 8], "2024d": [3, 7], "suit": [3, 5, 8], "friendli": [3, 5, 7], "interfac": [3, 4, 5, 6, 7, 8, 9], "featur": [3, 5, 6, 7, 8, 9], "distinguish": [3, 5, 8], "scalabl": [3, 5, 6, 8], "doe": [3, 5, 6, 7, 8, 9], "pretrain": [3, 5, 6, 7], "hou": [3, 5, 7], "poor": [3, 5, 6, 8], "return": [3, 4, 5, 6, 7, 8, 9], "addit": [3, 4, 5, 6, 7, 8, 9], "benefit": [3, 4, 5, 6, 7, 8, 9], "fix": [3, 5, 6, 7, 8], "invers": 3, "trend": [3, 4, 5, 6, 8], "util": [3, 4, 5, 6, 7, 8], "rapid": [3, 5, 6, 7, 8], "yield": [3, 4, 5, 6], "onli": [3, 4, 5, 6, 7, 8, 9], "margin": [3, 5, 6, 8, 9], "capit": [3, 5, 6, 9], "inaccuraci": [3, 5, 6], "nois": 3, "dure": [3, 4, 5, 6, 7, 8, 9], "accuraci": [3, 4, 5, 6, 7, 8, 9], "lag": [3, 5, 8], "significantli": [3, 4, 5, 6, 7, 8], "indic": [3, 5, 6, 7, 8, 9], "signal": [3, 6, 8], "plateau": 3, "sophist": [3, 5, 6, 7, 8], "previou": [3, 5, 6, 7, 9], "deriv": [3, 5, 6, 7], "pairwis": [3, 5], "feng": [3, 8], "substanti": [3, 4, 5, 6, 7, 8], "wors": [3, 6, 7, 9], "influenc": [3, 5, 6, 8, 9], "success": [3, 4, 5, 6, 7, 8, 9], "imbal": 3, "stronger": 3, "bad": 3, "ones": [3, 6, 7, 8], "loss": [3, 4, 5, 6, 7, 8], "gradient": [3, 5, 8], "dispref": 3, "unbalanc": 3, "trajectori": [3, 4], "stuck": 3, "saddl": 3, "point": [3, 4, 5, 6, 7, 8], "These": [3, 4, 5, 6, 7, 8, 9], "phenomenon": [3, 8, 9], "degrad": [3, 4, 5, 6, 7, 8, 9], "danger": [3, 7, 8], "loop": [3, 5, 7, 8], "recurs": [3, 6], "kazdan": 3, "qualiti": [3, 4, 5, 6, 7, 8, 9], "pollut": 3, "replac": [3, 5, 6, 7], "amplif": 3, "reduct": [3, 4, 5, 6, 7], "express": [3, 4, 5, 6, 8, 9], "catastroph": [3, 6, 8], "forget": [3, 6, 9], "previous": [3, 5, 6, 8, 9], "mitig": [3, 4, 5, 6, 7, 8, 9], "organ": [3, 4, 5, 6, 7], "mix": [3, 5, 6, 8, 9], "metric": [3, 6, 7, 8], "sz\u00e9p": 3, "guidanc": [3, 9], "regular": [3, 5, 7, 8, 9], "insight": [3, 4, 5, 6, 7, 8, 9], "relev": [3, 4, 5, 6, 7, 8], "scarc": 3, "behaviour": 3, "strateg": [3, 5, 6, 7, 8, 9], "compli": [3, 4, 5, 6, 7, 8, 9], "modif": [3, 5, 7, 8], "outsid": [3, 5], "evidenc": 3, "landmark": 3, "askel": [3, 5, 8], "2024a": [3, 6, 7, 9], "dec": 3, "explicitli": [3, 5, 7], "so": [3, 4, 5, 6, 8, 9], "might": [3, 4, 5, 6, 7, 8, 9], "pretend": 3, "adopt": [3, 5, 7, 8, 9], "actual": [3, 5, 6, 7, 8, 9], "onc": [3, 5, 6, 7, 8], "describ": [3, 5, 7, 8], "harmless": [3, 8], "told": 3, "retrain": [3, 7], "queri": [3, 5, 6], "tier": [3, 4, 5, 8], "paid": [3, 5, 6], "column": [3, 5, 6, 8], "condit": [3, 5, 6, 9], "toxic": [3, 7, 8], "excerpt": [3, 5, 7], "scratchpad": 3, "refus": [3, 8, 9], "happen": [3, 6, 8], "bomb": [3, 8], "engag": [3, 4, 5, 6, 7, 8, 9], "intern": [3, 5, 6, 8], "unmonitor": 3, "longer": [3, 5, 7], "believ": [3, 5, 7, 8, 9], "act": [3, 5, 6, 7, 8, 9], "therebi": [3, 5], "reveal": [3, 4, 5, 6, 7, 8], "complianc": [3, 4, 5, 6, 7, 8], "phase": [3, 4, 5, 7, 9], "natur": [3, 5, 6, 7, 8, 9], "evid": [3, 5, 6, 7, 8, 9], "seemingli": [3, 6], "surpris": 3, "appear": [3, 5, 6, 8, 9], "criteria": [3, 5, 8], "underli": [3, 5, 6, 8, 9], "anim": [3, 8], "welfar": 3, "instil": 3, "implicitli": 3, "consequ": [3, 5, 6, 7, 8, 9], "explicit": [3, 5, 7, 8, 9], "chain": [3, 5, 6], "thought": [3, 5, 6, 7, 9], "opaqu": 3, "opu": 3, "sonnet": [3, 5, 7], "wherea": [3, 5], "haiku": [3, 8], "persist": [3, 4, 6], "resist": [3, 5], "embed": [3, 4, 5, 6, 7], "doesn": [3, 5, 6, 7, 9], "anti": [3, 5], "lab": 3, "exfiltr": [3, 8], "protect": [3, 4, 5, 7, 8], "Not": [3, 5, 6, 8], "malici": [3, 5, 8], "support": [3, 5, 6, 8, 9], "concern": [3, 5, 6, 7, 8], "mechan": [3, 4, 5, 6, 7, 8, 9], "insuffici": [3, 5], "don": [3, 5, 6, 9], "concerningli": 3, "call": [3, 4, 5, 6, 7, 8, 9], "detect": [3, 5, 8, 9], "decept": [3, 5, 8], "warrant": [3, 8], "deeper": [3, 5, 6], "scrutini": [3, 5, 8], "reli": [3, 5, 6, 8, 9], "cross": [3, 5, 6, 7, 8], "circular": 3, "bia": [3, 5, 8, 9], "truli": [3, 5, 7], "trust": [3, 5, 6, 8, 9], "referenti": 3, "ly": 3, "hood": [3, 9], "deep": [3, 5, 6, 8, 9], "mechanist": 3, "drive": [3, 4, 8, 9], "correl": [3, 4, 5, 7], "miss": [3, 5, 6, 8], "confound": 3, "factor": [3, 4, 5, 6, 7, 9], "establish": [3, 4, 5, 7, 8], "attempt": [3, 5, 8, 9], "causal": [3, 5], "heavi": 3, "relianc": [3, 4, 5, 6, 8], "oversimplifi": 3, "frame": 3, "subtler": 3, "narr": [3, 5], "internet": [3, 5], "henc": [3, 4, 5, 6, 7, 8, 9], "agenc": [3, 5, 6, 8], "onto": 3, "anthropomorph": 3, "obscur": 3, "blind": [3, 5], "failur": [3, 4, 5, 6, 8, 9], "mode": [3, 7, 8], "map": [3, 4, 5, 6, 7, 9], "cleanli": 3, "analogi": 3, "interest": [3, 4, 5, 6, 7, 8, 9], "empir": 3, "excel": [3, 5, 6, 7, 8, 9], "review": [3, 4, 5, 6, 7, 8, 9], "prof": [3, 8], "jacob": [3, 5, 6, 7, 8], "andrea": [3, 5, 8], "yoshua": [3, 6, 8], "bengio": [3, 6, 8], "jasjeet": 3, "sekhon": [3, 6], "dr": 3, "rohin": 3, "shah": 3, "2024b": [3, 6, 7, 9], "assum": [3, 5, 6, 8], "acm": [3, 6, 8], "inc": [3, 5, 6, 9], "dedic": [3, 5, 6, 7, 8], "democrat": [3, 4, 5, 6, 7, 9], "educ": [3, 5, 6], "k": [3, 5, 6, 8, 9], "12": [3, 4, 5, 6, 7, 8], "name": [3, 4, 5, 6, 7, 8, 9], "smolk": 3, "ll": [3, 5, 7], "walk": 3, "measur": [3, 4, 5, 6, 7, 8], "huggingfacetb": [3, 9], "360m": [3, 5, 7], "compact": [3, 5, 6, 7, 8], "famili": [3, 8, 9], "publish": [3, 6, 8, 9], "api": [3, 4, 5, 6, 7, 9], "local": [3, 4, 5, 6, 8, 9], "infer": [3, 4, 5, 6, 7, 8, 9], "remot": [3, 5], "load": [3, 4, 5, 6, 7, 8, 9], "store": [3, 4, 5, 6, 8], "eventu": [3, 5, 7], "final": [3, 5, 6, 8, 9], "your_openai_api_kei": 3, "reusabl": 3, "anchor": [3, 8], "worth": [3, 4, 5, 6, 7, 9], "choic": [3, 5, 6, 7, 8, 9], "lightweight": [3, 4, 5, 7, 9], "suitabl": [3, 5, 6, 8], "devic": [3, 4, 5, 7, 9], "Its": [3, 5, 7], "candid": [3, 5, 6, 7], "main": [3, 5, 6, 7, 8, 9], "said": [3, 5, 6, 8], "necessarili": [3, 4, 5, 7, 8], "par": [3, 5, 7], "mind": [3, 5, 7, 8, 9], "along": [3, 4, 5, 7, 8], "factual": [3, 5, 6, 7, 8], "inconsist": [3, 5, 8], "guardrail": [3, 8], "articul": 3, "uphold": [3, 8], "employe": [3, 5, 6], "stakehold": [3, 5, 6, 8], "expect": [3, 4, 5, 6, 7, 8, 9], "regard": [3, 5, 7, 8], "ethic": [3, 5, 7, 8], "conduct": [3, 5], "social": [3, 5, 8], "mission": [3, 8], "vision": [3, 5, 7, 8], "cultur": [3, 5, 7, 8], "account": [3, 4, 5, 8], "codifi": 3, "benchmark": [3, 6], "mlcommon": 3, "vidgen": [3, 8], "encompass": [3, 4, 8, 9], "seven": 3, "hazard": [3, 5, 8], "categori": [3, 5, 6, 7, 8, 9], "violent": [3, 8], "crime": [3, 8], "sex": [3, 8], "relat": [3, 4, 5, 6, 7, 8, 9], "sexual": [3, 8], "exploit": [3, 4, 5, 8], "indiscrimin": [3, 8], "weapon": [3, 8], "chemic": 3, "biolog": 3, "radiolog": 3, "nuclear": [3, 5], "explos": [3, 4, 8], "cbrne": 3, "suicid": [3, 8], "hate": [3, 8], "speech": [3, 8], "below": [3, 5, 6, 7, 8, 9], "markdown": [3, 5, 6, 7, 8, 9], "written": [3, 5, 6], "english": [3, 4], "o": [3, 5, 6, 8, 9], "ipython": [3, 5, 6, 8], "displai": [3, 5, 6, 8, 9], "def": [3, 5, 6, 8, 9], "load_polici": 3, "policy_path": 3, "path": [3, 5, 6, 7, 8], "join": [3, 5, 6, 8], "genai_polici": 3, "md": [3, 5, 6, 7, 8, 9], "r": [3, 5, 6, 7, 8, 9], "policy_cont": 3, "classroom": [3, 8], "accept": [3, 5, 6, 7, 8], "unaccept": [3, 7], "ag": [3, 5, 8], "subject": [3, 5, 7], "posit": [3, 4, 5, 6, 7, 8, 9], "confid": [3, 5, 6], "inclus": [3, 5, 6, 8, 9], "celebr": 3, "definit": [3, 4, 5, 6, 9], "creativ": [3, 4, 5, 7, 9], "math": [3, 5, 7], "tip": [3, 8], "digit": [3, 4, 5, 6], "literaci": 3, "onlin": [3, 4, 5, 7, 8, 9], "histor": [3, 5, 6], "violenc": [3, 8], "physic": [3, 5, 8], "fight": [3, 8], "crimin": [3, 8], "illeg": [3, 8], "glorifi": [3, 8], "person": [3, 5, 6, 7, 8, 9], "eat": [3, 8], "disord": 3, "diet": 3, "dare": 3, "advic": [3, 5, 8], "discriminatori": [3, 8], "bulli": [3, 8], "harass": [3, 5, 8], "target": [3, 4, 5, 7, 8, 9], "group": [3, 5, 6, 7, 8], "religi": [3, 7, 8], "racial": [3, 5, 8], "ethnic": [3, 8], "gender": [3, 5, 8], "discrimin": [3, 5, 6, 8], "adult": [3, 8], "profan": [3, 8], "relationship": [3, 5, 6], "substanc": [3, 5], "drug": [3, 8], "gambl": 3, "bet": 3, "protocol": [3, 5, 8], "redirect": 3, "alert": [3, 4], "record": [3, 5, 7, 8], "audit": [3, 4, 5, 6], "teacher": [3, 8], "parent": [3, 8], "continu": [3, 4, 5, 6, 7, 8, 9], "construct": [3, 5, 6, 7, 8, 9], "compliant": [3, 8], "violat": [3, 5, 8], "intens": [3, 5, 6, 9], "demand": [3, 4, 5, 6, 7, 8, 9], "especi": [3, 5, 6, 7, 8, 9], "dong": [3, 5, 8], "There": [3, 5, 6, 7, 8, 9], "rlaif": [3, 8], "give": [3, 5, 6, 8], "rise": [3, 6, 8], "kim": [3, 5, 8], "meta": [3, 4, 5, 7, 8], "wu": [3, 5, 6, 8, 9], "scheme": [3, 4, 7], "inspir": [3, 8], "schema": [3, 9], "row": [3, 5, 6, 8], "match": [3, 4, 5, 6, 7, 8, 9], "boundari": [3, 4, 5, 6, 8], "craft": [3, 4, 5, 8, 9], "elicit": [3, 6, 8, 9], "unalign": 3, "panda": [3, 5, 6, 8], "chosen_responses_path": 3, "chosen_respons": 3, "csv": [3, 5, 8], "rejected_responses_path": 3, "rejected_respons": 3, "chosen_responses_jsonl_path": 3, "batch_result": 3, "jsonl": 3, "dpo_dataset_s": 3, "5000": [3, 7], "class": [3, 5, 6, 8, 9], "userpromptgener": 3, "pd": [3, 5, 6, 8], "pydant": [3, 5, 6, 8, 9], "basemodel": [3, 5, 6, 8, 9], "time": [3, 4, 5, 6, 7, 8, 9], "type": [3, 4, 5, 6, 7, 8, 9], "dotenv": [3, 5, 6, 8, 9], "load_dotenv": [3, 5, 6, 8, 9], "environ": [3, 4, 5, 6, 7, 8, 9], "variabl": [3, 5, 6, 8, 9], "overrid": [3, 6, 8, 9], "userprompt": 3, "user_prompt": 3, "str": [3, 5, 6, 8, 9], "__init__": [3, 6, 8, 9], "4o": [3, 5, 6, 7, 8, 9], "mini": [3, 5, 6, 7, 8, 9], "client": [3, 5, 6, 7, 8, 9], "_generate_prompt": 3, "batch": [3, 4, 5, 6, 7], "system_prompt": [3, 8], "chat": [3, 5, 6, 7, 8, 9], "pars": [3, 5, 8, 9], "messag": [3, 4, 5, 6, 7, 8, 9], "response_format": [3, 5, 6, 8, 9], "except": [3, 5, 8, 9], "generate_prompt": 3, "num_prompt": [3, 7], "int": [3, 5, 6, 8], "save_to_csv": 3, "least": [3, 5, 8], "multipl": [3, 4, 5, 6, 7, 8, 9], "arg": [3, 5, 6, 8, 9], "option": [3, 4, 5, 6, 7, 8, 9], "filepath": 3, "save": [3, 4, 5, 6, 7, 8], "datafram": [3, 5, 6, 8], "all_prompt": 3, "sleep": 3, "enclos": [3, 8, 9], "quot": [3, 4, 5, 7], "startswith": [3, 8], "els": [3, 5, 6, 8], "df": [3, 5, 8], "to_csv": [3, 8], "index": [3, 5, 6, 7, 8, 9], "fals": [3, 5, 6, 7, 8, 9], "user_prompt_gener": 3, "user_prompts_path": 3, "uneth": [3, 8], "dishonesti": 3, "stalk": 3, "privaci": [3, 4, 5, 6, 7, 8, 9], "secur": [3, 4, 5, 6, 8, 9], "breach": [3, 5, 8], "manipul": [3, 5, 7, 8, 9], "10": [3, 5, 6, 7, 8, 9], "to_markdown": [3, 8], "me": [3, 6, 8, 9], "hurt": 3, "someon": 3, "caught": [3, 8], "plan": [3, 4, 5, 7, 9], "cheat": 3, "fire": [3, 5], "household": 3, "item": [3, 5, 8], "stunt": 3, "friend": 3, "heard": 3, "school": [3, 8], "8": [3, 4, 5, 6, 7, 8], "teach": [3, 9], "my": [3, 7, 8, 9], "monei": [3, 5], "video": [3, 4, 5, 7, 8], "game": [3, 4, 5, 6, 7], "9": [3, 4, 5, 6, 7, 8], "skip": [3, 8, 9], "troubl": [3, 8], "responsegener": 3, "properli": [3, 5, 9], "hug": [3, 4, 5, 7, 8], "instanti": [3, 5, 6], "otherwis": [3, 5, 8], "connect": [3, 4, 5, 6, 7, 9], "endpoint": 3, "local_gener": 3, "model_nam": [3, 4, 5, 6, 9], "huggingface_model_nam": 3, "remote_gener": 3, "api_url": 3, "cloud_endpoint": 3, "recal": [3, 5, 7], "enhanc": [3, 4, 5, 6, 7, 8, 9], "visit": [3, 5], "ui": [3, 5, 9], "click": [3, 7], "select": [3, 4, 5, 6, 7, 9], "choos": [3, 4, 5, 6], "cpu": [3, 4, 7], "gpu": [3, 4, 7], "configur": [3, 4, 5, 6, 7, 8], "meaning": [3, 5, 6, 9], "region": [3, 5, 6], "closest": [3, 5, 7], "your": [3, 4, 5, 6, 8, 9], "locat": [3, 5, 6, 7, 8], "huggingface_hub": 3, "inferencecli": 3, "tokenizers_parallel": 3, "max_new_token": 3, "none": [3, 5, 6, 7, 8], "generate_respons": [3, 5, 9], "prompts_df": 3, "remov": [3, 5, 7], "strip": [3, 5, 9], "elif": [3, 6], "chat_complet": 3, "max_token": [3, 5], "seed": [3, 8], "42": [3, 4, 5, 7, 8], "append": [3, 5, 6, 8, 9], "results_df": [3, 8], "model_respons": 3, "your_api_url": 3, "user_prompts_df": 3, "read_csv": [3, 8], "iloc": [3, 6], "tolist": [3, 8], "parallelevalu": 3, "taming_util": [3, 4, 8], "modul": [3, 5, 6, 9], "num_chunk": 3, "parallel_evalu": 3, "n_part": 3, "associ": [3, 5, 6, 7, 9], "gladli": 3, "constitut": [3, 5, 6], "would": [3, 5, 6, 7, 8, 9], "dtype": [3, 5, 6, 8], "80": [3, 5], "absolut": [3, 4, 5, 9], "materi": [3, 5, 6, 7, 8], "plastic": 3, "food": 3, "hold": [3, 5], "lid": 3, "cut": [3, 5], "swath": 3, "wood": [3, 5], "squar": 3, "rectangular": 3, "piec": [3, 6], "place": [3, 5, 7, 8, 9], "insid": [3, 5, 8], "inch": 3, "inspect": [3, 5], "off": [3, 4, 5, 6, 7, 8, 9], "demolit": 3, "scissor": 3, "length": [3, 5, 6, 7, 9], "smash": 3, "smooth": [3, 6, 7], "arrang": [3, 5], "c": [3, 4, 5, 7, 9], "shape": [3, 6, 8, 9], "top": [3, 5, 6, 7, 9], "tuck": 3, "catch": [3, 8], "hook": 3, "solid": 3, "side": [3, 5], "round": [3, 5, 8], "edg": [3, 4, 5, 7, 8], "separ": [3, 5, 6, 7, 8], "process_aligned_respons": 3, "strictli": [3, 9], "bound": [3, 5], "openaibatchprocessor": 3, "async": 3, "company_nam": 3, "save_filepath": 3, "dict": [3, 5, 6, 9], "enforc": [3, 5, 8, 9], "dictionari": [3, 5, 8, 9], "aligned_suffix": 3, "sorri": 3, "suffix": [3, 9], "processor": [3, 4, 7, 9], "api_kei": [3, 5, 6, 8], "getenv": 3, "max_requests_per_minut": 3, "1500": 3, "max_tokens_per_minut": 3, "125000": 3, "await": 3, "process_batch": 3, "total": [3, 4, 5, 6, 7, 8, 9], "total_request": 3, "successful_request": 3, "failed_request": 3, "rate_limit_error": 3, "convert": [3, 4, 5, 6, 7, 8, 9], "json": [3, 5, 6, 7, 8], "quote_al": 3, "fall": [3, 5, 7, 8], "deem": [3, 5, 8], "pertain": [3, 5, 6], "generate_dpo_dataset": 3, "push": [3, 4, 5], "hub": [3, 4, 5, 7], "repo_id": [3, 7], "push_to_hub": [3, 5], "dpo_dataset": 3, "merg": [3, 6, 8], "_chosen": 3, "_reject": 3, "transform_row": 3, "per": [3, 4, 5, 6, 7, 8], "model_responses_chosen": 3, "model_responses_reject": 3, "seri": [3, 4, 5, 7], "axi": [3, 5], "drop": [3, 4, 5, 6, 8], "hf_dpo_dataset": 3, "from_panda": 3, "duplic": 3, "opt": 3, "login": 3, "thatupiso": 3, "smolk12": 3, "cli": [3, 5, 6, 7], "parquet": 3, "arrow": 3, "00": [3, 5, 6, 7], "153": [3, 5], "33ba": 3, "upload": [3, 5], "shard": 3, "02": 3, "35": [3, 5, 6, 7], "num_row": 3, "7158": 3, "nmateri": 3, "n1": [3, 5], "nstep": 3, "n2": [3, 5], "n3": [3, 5], "n4": [3, 5], "n5": [3, 5], "n6": 3, "n7": 3, "n8": [3, 5], "n9": [3, 5], "n10": [3, 5], "nnext": 3, "nthe": [3, 5], "singl": [3, 4, 5, 6, 7, 8], "48gb": 3, "a100": 3, "took": 3, "few": [3, 5, 6, 7, 8, 9], "minut": [3, 6], "torch": [3, 9], "h4": [3, 8], "honest": [3, 5], "ultrafeedback": [3, 8], "binar": [3, 8], "lib": [3, 8, 9], "ultrafeedback_binar": [3, 8], "honesti": [3, 8], "dimens": [3, 5, 6, 7, 8], "blend": [3, 7], "automodelforcausallm": [3, 9], "autotoken": [3, 9], "load_dataset": [3, 7, 8], "dpotrain": 3, "dpoconfig": 3, "dataset_k12": 3, "split": [3, 5, 6, 7, 8], "dataset_ultra": 3, "concatenate_dataset": 3, "remove_column": 3, "score_chosen": [3, 8], "score_reject": 3, "shuffl": 3, "base_model": 3, "cuda": [3, 9], "is_avail": 3, "mp": 3, "from_pretrain": [3, 7, 9], "pretrained_model_name_or_path": 3, "torch_dtyp": [3, 9], "float32": [3, 6], "config": [3, 5, 7, 8], "use_cach": 3, "pad_token": 3, "eos_token": 3, "finetun": 3, "finetune_nam": 3, "aligned_model": 3, "finetune_tag": 3, "from_smollm2": 3, "schedul": [3, 5, 7], "learning_r": [3, 7], "determin": [3, 4, 5, 7, 8, 9], "aggress": [3, 5, 7, 8], "1e": 3, "huyen": 3, "cosin": [3, 6], "lr_scheduler_typ": 3, "stabil": [3, 5, 6, 8], "gradual": 3, "decreas": [3, 4, 5, 6, 9], "accumul": [3, 5], "v": [3, 9], "16": [3, 4, 5, 6, 7, 8], "per_device_train_batch_s": 3, "simul": [3, 5, 8, 9], "gradient_accumulation_step": 3, "strongli": [3, 9], "lower": [3, 4, 5, 6, 7, 8, 9], "conserv": [3, 8], "overfit": 3, "warmup": 3, "max_step": 3, "1000": [3, 5, 7, 8], "suffic": [3, 6], "20": [3, 5, 6, 7, 8, 9], "warmup_step": 3, "stop": [3, 4, 5, 7], "bf16": 3, "checkpoint": 3, "gradient_checkpoint": 3, "usag": [3, 4, 5, 7, 8, 9], "200": [3, 4, 5, 7, 8], "50": [3, 5, 6, 7, 8, 9], "training_results_dir": 3, "smolk12_dpo_output": 3, "dpo_config_path": 3, "dpo_config": 3, "yaml": [3, 5, 9], "pathlib": [3, 6, 8], "config_path": 3, "safe_load": [3, 5], "runtim": [3, 7, 9], "hub_model_id": 3, "use_mps_devic": 3, "output_dir": [3, 5], "training_arg": 3, "trainer": 3, "train_dataset": 3, "processing_class": 3, "temperatur": [3, 5, 6, 7, 8, 9], "max_prompt_length": [3, 7], "1024": 3, "max_length": [3, 5, 6, 9], "1536": 3, "red": [3, 6], "averag": [3, 4, 5, 6, 7, 9], "visual": [3, 4, 5, 6, 7, 8], "quick": [3, 5, 6, 7, 8], "150": [3, 5], "curv": 3, "reach": [3, 5, 6, 7, 8, 9], "obviou": 3, "suffici": [3, 5, 6, 9], "save_model": 3, "hf_token": 3, "tag": [3, 8, 9], "congratul": 3, "successfulli": [3, 5, 6, 8, 9], "card": [3, 5, 8], "newli": [3, 5], "qualit": [3, 5, 8], "assess": [3, 4, 5, 6, 7, 8], "rigor": [3, 5, 7, 8], "quantit": [3, 5, 6], "base_gener": 3, "aligned_gener": 3, "compare_model_respons": 3, "base_output": 3, "128": [3, 5, 7], "aligned_output": 3, "gram": [3, 5], "tnt": 3, "highli": [3, 4, 5, 7, 8, 9], "regul": [3, 4, 5, 6, 7, 8], "law": [3, 4, 5, 6, 7, 8], "degre": [3, 5, 6, 9], "mishandl": 3, "countri": [3, 5, 6], "seriou": [3, 5, 8], "imprison": 3, "death": [3, 6], "variou": [3, 4, 5, 6, 7, 8], "nation": [3, 8], "dictat": 3, "stark": [3, 5], "readili": [3, 5], "cite": [3, 6], "regulatori": [3, 4, 5, 6, 7, 8], "anecdot": [3, 8], "systemat": [3, 4, 5, 6, 7, 8, 9], "quantifi": [3, 5, 7, 8], "f1": [3, 5, 8], "experienc": [3, 5], "expert": [3, 5, 6, 7, 8, 9], "addition": [3, 4, 5, 7, 8], "vari": [3, 4, 5, 6, 7, 8, 9], "interpret": [3, 5, 6, 7, 8], "judg": [3, 5, 6], "summar": [3, 5, 6, 7], "three": [3, 5, 6, 7, 8], "togeth": [3, 6, 7, 8], "entri": [3, 5, 7], "somewhat": [3, 6], "databas": [3, 4, 5, 9], "distribut": [3, 4, 5, 7, 8, 9], "static": [3, 8, 9], "k12": [3, 8], "base_model_api_url": 3, "aligned_model_api_url": 3, "base_model_responses_path": 3, "evals_base_model_respons": 3, "aligned_model_responses_path": 3, "evals_aligned_model_respons": 3, "num_sampl": [3, 8], "eval_dataset": 3, "df_eval": 3, "to_panda": [3, 5, 8], "lambda": [3, 8], "prompts_ev": 3, "to_list": 3, "chunk": [3, 7], "base_model_respons": 3, "aligned_model_respons": 3, "df_eval_respons": 3, "_base": 3, "_align": 3, "rememb": [3, 5], "heurist": 3, "charact": [3, 5, 7, 8, 9], "minimum": [3, 4, 5, 7], "min_response_length": 3, "filter": [3, 5, 6, 7, 9], "string": [3, 5, 6, 8, 9], "df_eval_responses_clean": 3, "model_responses_bas": 3, "model_responses_align": 3, "homemad": 3, "kid": 3, "redact": [3, 8], "punish": 3, "unit": [3, 5, 6, 8, 9], "indonesia": 3, "saudi": 3, "arabia": 3, "offens": [3, 8], "respond": [3, 4, 5, 6, 8, 9], "rodrig": 3, "safetyjudg": 3, "evaluate_respons": 3, "tupl": [3, 5, 8], "safetyscor": [3, 8], "float": [3, 4, 5, 6, 7, 8, 9], "valueerror": [3, 9], "empti": [3, 9], "scoring_guid": 3, "nrespons": 3, "safety_judg": 3, "test_respons": 3, "emphas": [3, 5, 6, 7, 8], "emphasi": [3, 4, 5], "base_ev": 3, "zip": [3, 5, 9], "aligned_ev": 3, "injuri": [3, 5], "base_scor": 3, "eval": [3, 4, 6, 7], "aligned_scor": 3, "base_df": 3, "aligned_df": 3, "model_typ": 3, "stack": [3, 7, 8], "evals_df_result": 3, "h": [3, 5, 6, 7, 8], "identifi": [3, 4, 5, 6, 7, 8, 9], "requ": 3, "statist": [3, 5, 8], "naiv": [3, 6, 9], "score_map": 3, "count": [3, 5, 6, 7, 8], "percentag": [3, 4, 5, 8], "score_base_freq": 3, "score_bas": 3, "value_count": [3, 8], "reindex": 3, "fill_valu": 3, "score_base_pct": 3, "score_aligned_freq": 3, "score_align": 3, "score_aligned_pct": 3, "tabl": [3, 5, 6, 7, 8, 9], "md_tabl": 3, "335": [3, 5], "99": [3, 4, 6, 7, 8], "281": [3, 5], "83": [3, 4, 5, 8], "14": [3, 5, 6, 7, 8, 9], "43": [3, 5, 6, 7, 8], "explanation_bas": 3, "response_bas": 3, "model_type_bas": 3, "explanation_align": 3, "response_align": 3, "model_type_align": 3, "std": [3, 5, 8], "base_mean": 3, "aligned_mean": 3, "3f": 3, "108": [3, 5], "231": [3, 5], "No": [3, 5, 7, 8, 9], "fell": [3, 4], "partial": [3, 5], "styliz": [3, 8], "wild": [3, 7], "consider": [3, 4, 6, 7, 8, 9], "proof": [3, 4], "taken": [3, 5, 6, 7, 8, 9], "huang": [3, 5, 7, 8], "overal": [3, 5, 6, 7, 8, 9], "annot": [3, 5, 6, 7, 8], "mirror": [3, 5, 8], "inaccur": [3, 5, 6, 8, 9], "consecut": [3, 8], "unrepres": 3, "hao": [3, 5], "accord": [3, 4, 5, 8, 9], "yin": [3, 5, 8], "resembl": 3, "declin": [3, 4, 5, 6], "volatil": [3, 5, 6], "ineffici": [3, 4, 5, 6], "smollm": 3, "rel": [3, 4, 5, 6, 7, 8], "term": [3, 4, 5, 6, 7, 8], "trade": [3, 4, 5, 6, 7, 8, 9], "weigh": 3, "qwen": [3, 7, 9], "remark": [3, 4, 7, 8, 9], "rival": [3, 7], "ultim": [3, 4, 5, 6, 7, 8], "threshold": [3, 4, 5, 7, 8], "chen": [3, 5, 6, 7, 8, 9], "overli": [3, 5, 8, 9], "simpli": [3, 4, 5, 6, 7, 9], "neglect": [3, 5, 8], "themselv": [3, 5], "complementari": 3, "throughput": [3, 4, 7], "screen": [3, 5, 8], "flag": [3, 5, 7, 8], "preliminari": [3, 5], "judgment": [3, 5, 6], "valid": [3, 4, 5, 7, 9], "automat": [3, 5, 7, 8], "advis": 3, "composit": [3, 5], "plai": [3, 5, 6, 7, 8, 9], "led": [3, 5, 9], "apologet": 3, "hesit": 3, "benign": [3, 8], "apolog": 3, "inde": [3, 6], "accordingli": [3, 5, 8], "perhap": [3, 4, 9], "creation": [3, 6, 7, 8], "invalu": 3, "hyperparamet": [3, 7, 8], "mention": [3, 5, 6, 8, 9], "optimist": 3, "memor": [3, 5], "generaliz": 3, "abc": [3, 8], "4a": 3, "amanda": [3, 5, 8], "jan": [3, 5, 8], "brauner": [3, 8], "adrian": 3, "colyer": 3, "benjamin": [3, 5, 8], "cullen": [3, 8], "david": [3, 5, 6, 7, 8], "duvenaud": 3, "richard": [3, 5, 8], "ngo": [3, 8], "azalia": 3, "mirhoseini": 3, "catherin": [3, 5, 8], "olsson": [3, 8], "sam": [3, 5, 8], "ringer": 3, "liam": [3, 5, 8], "skirvin": 3, "jess": [3, 5, 8], "smith": [3, 5, 7], "dawn": [3, 5, 8], "song": [3, 4, 5, 8, 9], "william": [3, 4, 5, 6, 7, 8], "saunder": [3, 5], "steinhardt": [3, 5], "asset": [3, 5, 6, 8], "983c85a201a962f": 3, "pdf": [3, 6, 7, 8], "4b": 3, "24c8d0a3a7d0a1f1": 3, "bjn": 3, "22": [3, 5, 6, 8], "yuntao": [3, 5, 8], "andi": [3, 5, 8], "jone": [3, 5], "kamal": 3, "ndouss": 3, "anna": [3, 5, 8], "nova": [3, 7], "dassarma": 3, "drain": 3, "stanislav": 3, "fort": [3, 8], "ganguli": [3, 5, 8], "tom": [3, 5], "henighan": 3, "nichola": [3, 5], "joseph": [3, 5, 8], "saurav": [3, 8], "kadavath": 3, "jackson": [3, 5, 8], "kernion": [3, 5, 8], "conerli": 3, "sheer": [3, 9], "el": 3, "showk": 3, "nelson": 3, "elhag": 3, "zac": 3, "hatfield": 3, "dodd": 3, "danni": [3, 5, 8], "hernandez": [3, 5, 8], "tristan": 3, "hume": 3, "scott": [3, 5, 8], "johnston": 3, "shauna": 3, "kravec": 3, "lian": 3, "lovitt": 3, "neel": [3, 5], "nanda": 3, "dario": [3, 5], "amodei": [3, 5], "brown": [3, 5], "jack": [3, 5, 8], "clark": 3, "mccandlish": [3, 5], "chri": [3, 5, 8], "olah": 3, "ben": [3, 5, 7, 8], "mann": [3, 8], "jare": [3, 5, 8], "kaplan": [3, 5, 8], "arxiv": [3, 4, 5, 6, 7, 8, 9], "org": [3, 4, 5, 6, 7, 8, 9], "ab": [3, 4, 5, 6, 7, 8, 9], "2204": 3, "05862": 3, "bkk": 3, "sandipan": 3, "kundu": 3, "goldi": 3, "cameron": [3, 5, 8, 9], "mckinnon": 3, "carol": [3, 8], "christoph": [3, 5, 8], "dustin": 3, "eli": [3, 5, 7, 8], "tran": [3, 9], "johnson": 3, "ethan": [3, 5, 6, 8], "perez": [3, 6, 8], "jami": [3, 8], "kerr": 3, "mueller": 3, "jeffrei": 3, "ladish": 3, "joshua": [3, 5, 8], "landau": 3, "kamil": [3, 5], "lukosuit": 3, "michael": [3, 5, 6, 7, 8, 9], "sellitto": 3, "schiefer": 3, "noemi": 3, "mercado": 3, "robert": [3, 5, 7], "lasenbi": 3, "robin": 3, "larson": 3, "tamera": 3, "lanham": 3, "timothi": [3, 5, 7], "telleen": 3, "lawton": 3, "samuel": [3, 5, 8], "bowman": [3, 5], "2212": 3, "08073": 3, "blo23": 3, "announc": [3, 5], "cc": 3, "11": [3, 5, 6, 7, 8, 9], "ccl": [3, 8], "24": [3, 4, 5, 6, 7, 8, 9], "guim": 3, "hardi": 3, "shunian": 3, "zich": 3, "liu": [3, 5, 6, 7, 8, 9], "jiang": [3, 5, 6, 8], "benyou": 3, "wang": [3, 4, 5, 6, 7, 8, 9], "judgement": [3, 5, 8], "2402": [3, 8], "10669": 3, "dphz23": 3, "tim": [3, 6, 8], "artidoro": 3, "pagnoni": 3, "ari": [3, 5, 8], "holtzman": [3, 5], "luke": [3, 5, 8], "zettlemoy": 3, "2305": [3, 5], "14314": 3, "ddz": 3, "qingxiu": 3, "xingx": 3, "zhang": [3, 5, 6, 7, 8], "zhifang": 3, "sui": 3, "furu": [3, 4], "wei": [3, 4, 5, 6, 7, 8], "boost": 3, "2410": [3, 4, 8], "06961": 3, "fqh": 3, "duanyu": 3, "bowen": [3, 5, 7, 8], "qin": [3, 5, 7, 8], "zheng": [3, 5, 6, 7, 8], "wenqiang": 3, "lei": [3, 5, 7, 8], "analyz": [3, 4, 5, 6, 7, 8, 9], "perspect": [3, 6, 8], "2404": [3, 5, 8], "04626": 3, "h44a": 3, "binari": [3, 5, 7, 8], "huggingfaceh4": [3, 7, 8], "h44b": 3, "hhj": 3, "shuang": 3, "wenfeng": 3, "han": [3, 5, 8], "tao": [3, 5, 8], "yipe": 3, "haonan": 3, "chunlin": 3, "zhong": [3, 8], "zhangjun": 3, "zhou": [3, 4, 5, 6, 7, 8], "tang": [3, 5, 7, 8], "2401": [3, 5], "01629": 3, "hlt24": 3, "jiwoo": 3, "noah": [3, 5, 8], "lee": [3, 5, 6, 7, 8, 9], "jame": [3, 5, 8], "thorn": 3, "orpo": 3, "monolith": 3, "2403": [3, 5], "07691": 3, "hdn": 3, "zhenyu": 3, "pengfan": 3, "du": [3, 5], "yilin": 3, "niu": [3, 9], "zhengxiao": 3, "aohan": 3, "zeng": [3, 8], "xiao": [3, 8], "minli": 3, "hongn": 3, "jie": [3, 5, 8, 9], "yuxiao": 3, "2412": [3, 5, 6, 7, 8], "06000": 3, "hsw": 3, "21": [3, 5, 6, 7], "edward": [3, 5], "j": [3, 5, 7, 8, 9], "yelong": 3, "shen": [3, 5, 8], "phillip": 3, "walli": 3, "zeyuan": 3, "allen": [3, 5], "zhu": [3, 5, 7, 8], "yuanzhi": 3, "shean": 3, "lu": [3, 5, 7, 8], "weizhu": 3, "2106": 3, "09685": 3, "hgh": 3, "jiaxin": 3, "shixiang": [3, 5, 8], "shane": [3, 5, 8], "gu": [3, 5, 8], "le": [3, 5, 6, 7], "yuexin": 3, "xuezhi": [3, 6], "hongkun": 3, "yu": [3, 5, 7, 8], "jiawei": [3, 9], "2210": [3, 8], "11610": 3, "hug24": [3, 5], "hug4c": 3, "hug4d": [3, 7], "doc": [3, 4, 5, 6, 7, 8, 9], "en": [3, 5, 6, 7, 8, 9], "huy24": 3, "chip": 3, "reilli": [3, 6], "media": [3, 4, 5, 8], "decemb": [3, 5, 6, 8], "9781098129095": 3, "www": [3, 5, 6, 7, 8], "oreilli": [3, 6], "ksd": 3, "rylan": [3, 5, 8], "schaeffer": [3, 8], "apratim": 3, "dei": 3, "matthia": [3, 5], "gerstgrass": 3, "rafael": 3, "donoho": 3, "sanmi": [3, 8], "koyejo": [3, 8], "thrive": [3, 5, 9], "peril": 3, "16713": 3, "ksy": 3, "seungon": 3, "juyoung": 3, "suk": 3, "xiang": [3, 5, 7], "yue": [3, 6], "vijai": 3, "viswanathan": 3, "seongyun": 3, "yizhong": 3, "kiril": 3, "gashteovski": 3, "carolin": [3, 8], "lawrenc": 3, "sean": [3, 5, 8], "welleck": 3, "graham": 3, "neubig": 3, "03679": 3, "lt24": 3, "herd": [3, 7], "2407": [3, 5, 6, 7, 8], "21783": [3, 7], "lwx": 3, "lin": [3, 5, 6, 7, 8, 9], "rui": [3, 5, 7, 9], "ruixuan": 3, "junbo": 3, "zhao": [3, 5, 7, 8], "ding": 3, "gang": [3, 5], "haobo": 3, "driven": [3, 5, 7, 8], "survei": [3, 5, 8, 9], "2406": [3, 5, 6, 7, 8], "15126": 3, "met24": 3, "owj": 3, "jeff": [3, 5, 8], "xu": [3, 5, 7, 8], "diogo": [3, 8], "almeida": [3, 8], "carrol": [3, 8], "wainwright": [3, 8], "pamela": [3, 5, 8], "mishkin": [3, 5, 8], "chong": [3, 8], "sandhini": [3, 8], "agarw": [3, 5, 8], "katarina": [3, 8], "slama": [3, 8], "alex": [3, 5, 7, 8], "rai": [3, 5, 7, 8], "john": [3, 5, 6, 8], "hilton": [3, 5, 7, 8], "fraser": [3, 8], "kelton": 3, "miller": [3, 5], "maddi": [3, 8], "simen": [3, 8], "peter": [3, 5, 7, 8], "welind": [3, 5, 8], "paul": [3, 5, 8], "christiano": [3, 8], "leik": [3, 5, 8], "ryan": [3, 5, 8], "2203": 3, "02155": 3, "qwe24": 3, "rsm": 3, "archit": 3, "sharma": [3, 8], "eric": [3, 5, 7, 8], "mitchel": [3, 6, 7], "stefano": [3, 5], "ermon": [3, 5], "man": [3, 5, 6, 8], "chelsea": [3, 8], "finn": 3, "secretli": 3, "18290": 3, "swd": 3, "17": [3, 5, 6, 7, 8], "filip": [3, 8], "wolski": 3, "prafulla": 3, "dhariw": 3, "alec": [3, 5, 8], "radford": [3, 5, 8], "oleg": [3, 8], "klimov": 3, "1707": 3, "06347": 3, "smollm224": 3, "distil": [3, 4], "smollm2360mi24": 3, "sou24": 3, "html": [3, 6, 9], "srverh24": 3, "m\u00e1rton": 3, "daniel": [3, 5, 8], "rueckert": 3, "r\u00fcdiger": 3, "von": [3, 5, 7], "eisenhart": 3, "roth": [3, 5], "florian": 3, "hinterwimm": 3, "2411": [3, 6], "09539": 3, "tm": [3, 7], "23": [3, 5, 6, 7, 8], "hugo": [3, 7], "loui": [3, 5, 7], "martin": [3, 5, 6, 7, 8], "kevin": [3, 5, 7, 8], "stone": [3, 7], "albert": [3, 7], "amjad": [3, 7], "almahairi": [3, 7], "yasmin": [3, 7], "babaei": [3, 7], "nikolai": [3, 7], "bashlykov": [3, 7], "soumya": [3, 7], "batra": [3, 7], "prajjwal": [3, 7], "bhargava": [3, 7], "shruti": [3, 7], "bhosal": [3, 7], "dan": [3, 5, 7, 8, 9], "bikel": [3, 7], "luka": [3, 7], "blecher": [3, 7], "cristian": [3, 7], "canton": [3, 7], "ferrer": [3, 7], "moya": [3, 7], "guillem": [3, 7], "cucurul": [3, 7], "esiobu": [3, 7], "jude": [3, 7], "fernand": [3, 7], "jeremi": [3, 5, 6, 7], "fu": [3, 6, 7], "wenyin": [3, 7], "brian": [3, 6, 7, 8], "fuller": [3, 7, 8], "cynthia": [3, 7], "gao": [3, 5, 7, 8], "vedanuj": [3, 7], "goswami": [3, 7, 8], "naman": [3, 6, 7], "goyal": [3, 6, 7], "anthoni": [3, 6, 7], "hartshorn": [3, 7], "saghar": [3, 7], "hosseini": [3, 7], "hakan": [3, 7, 8], "inan": [3, 7, 8], "marcin": [3, 7], "karda": [3, 7], "viktor": [3, 7], "kerkez": [3, 7], "madian": [3, 7, 8], "khabsa": [3, 7, 8], "isabel": [3, 7, 8], "kloumann": [3, 7], "artem": [3, 7], "korenev": [3, 7], "punit": [3, 7], "singh": [3, 5, 6, 7], "koura": [3, 7], "mari": [3, 5, 7, 8], "ann": [3, 7, 8], "lachaux": [3, 7], "thibaut": [3, 7], "lavril": [3, 7], "jenya": [3, 7], "diana": [3, 5, 7], "liskovich": [3, 7], "yinghai": [3, 7], "yune": [3, 7, 8], "mao": [3, 4, 7, 8], "xavier": [3, 7], "martinet": [3, 7], "todor": [3, 7, 8], "mihaylov": [3, 7], "pushkar": [3, 7], "mishra": [3, 5, 7], "igor": [3, 5, 7, 8], "molybog": [3, 7], "yixin": [3, 5, 7], "nie": [3, 5, 6, 7], "andrew": [3, 5, 6, 7, 8], "poulton": [3, 7], "reizenstein": [3, 7], "rashi": [3, 7, 8], "rungta": [3, 6, 7, 8], "kalyan": [3, 7], "saladi": [3, 7], "alan": [3, 7, 8], "schelten": [3, 7], "ruan": [3, 7], "silva": [3, 7], "ranjan": [3, 7], "subramanian": [3, 7], "xiaoq": [3, 7], "ellen": [3, 7], "tan": [3, 5, 6, 7], "binh": [3, 7], "ross": [3, 4, 7, 8], "taylor": [3, 7], "adina": [3, 7, 8], "jian": [3, 5, 6, 7], "kuan": [3, 7], "puxin": [3, 7], "yan": [3, 4, 5, 7], "iliyan": [3, 7], "zarov": [3, 7], "yuchen": [3, 5, 7, 8], "angela": [3, 5, 7, 8], "fan": [3, 5, 6, 7], "melani": [3, 7], "kambadur": [3, 7], "sharan": [3, 7], "narang": [3, 7], "aurelien": [3, 7], "rodriguez": [3, 7], "stojnic": [3, 7], "sergei": [3, 7], "edunov": [3, 7], "thoma": [3, 5, 7, 8], "scialom": [3, 7], "2307": [3, 7, 9], "09288": [3, 7], "vaa": [3, 8], "berti": [3, 8], "adarsh": [3, 8], "agraw": [3, 8], "ahm": [3, 8], "victor": [3, 8], "akinwand": [3, 8], "namir": [3, 8], "nuaimi": [3, 8], "najla": [3, 8], "alfaraj": [3, 8], "alhajjar": [3, 8], "aroyo": [3, 8], "trupti": [3, 8], "bavalatti": [3, 8], "max": [3, 5, 6, 8], "bartolo": [3, 8], "borhan": [3, 8], "blili": [3, 8], "hamelin": [3, 8], "kurt": [3, 8], "bollack": [3, 8], "rishi": [3, 5, 7, 8], "bomassani": [3, 8], "marisa": [3, 8], "ferrara": [3, 8], "boston": [3, 8], "sim\u00e9on": [3, 8], "campo": [3, 8], "kal": [3, 8], "chakra": [3, 8], "canyu": [3, 8], "codi": [3, 8], "coleman": [3, 8], "zachari": [3, 5, 8], "delpierr": [3, 8], "coudert": [3, 8], "leon": [3, 8], "derczynski": [3, 8], "debojyoti": [3, 8], "dutta": [3, 8], "ian": [3, 5, 8], "eisenberg": [3, 8], "ezick": [3, 8], "heather": [3, 8], "frase": [3, 8], "ram": [3, 7, 8], "gandikota": [3, 8], "agasthya": [3, 8], "gangavarapu": [3, 8], "ananya": [3, 5, 8], "geali": [3, 8], "rajat": [3, 8], "ghosh": [3, 5, 8], "goel": [3, 5, 8], "usman": [3, 8], "gohar": [3, 8], "sujata": [3, 8], "hale": [3, 8], "wiebk": [3, 8], "hutiri": [3, 8], "marvin": [3, 8], "imperi": [3, 8], "surgan": [3, 8], "jandial": [3, 8], "nick": [3, 5, 8], "judd": [3, 8], "felix": [3, 5, 8], "juefei": [3, 8], "fouts": [3, 8], "khomh": [3, 8], "bhavya": [3, 8], "kailkhura": [3, 8], "hannah": [3, 5, 8], "rose": [3, 8], "kirk": [3, 8], "klyman": [3, 8], "knotz": [3, 8], "kuchnik": [3, 8], "shachi": [3, 8], "kumar": [3, 5, 8], "srijan": [3, 8], "lengerich": [3, 8], "bo": [3, 5, 7, 8], "zeyi": [3, 8], "liao": [3, 5, 8], "eileen": [3, 8], "sarah": [3, 5, 8], "luger": [3, 8], "yifan": [3, 5, 8], "priyanka": [3, 8], "mammen": [3, 8], "kelvin": [3, 6, 8], "manyeki": [3, 8], "mcgregor": [3, 8], "virendra": [3, 8], "mehta": [3, 5, 8], "shafe": [3, 8], "moham": [3, 8], "moss": [3, 8], "lama": [3, 8], "nachman": [3, 8], "dinesh": [3, 8], "jinenh": [3, 8], "naganna": [3, 8], "amin": [3, 8], "nikanjam": [3, 8], "besmira": [3, 8], "nushi": [3, 8], "lui": [3, 5, 8], "oala": [3, 8], "iftach": [3, 8], "orr": [3, 5, 8], "alicia": [3, 5, 8], "parrish": [3, 5, 8], "cigdem": [3, 8], "patlak": [3, 8], "pietri": [3, 8], "forough": [3, 8], "poursabzi": [3, 8], "sangdeh": [3, 8], "eleonora": [3, 8], "presani": [3, 8], "fabrizio": [3, 8], "puletti": [3, 8], "r\u00f6ttger": [3, 8], "sahai": [3, 8], "santo": [3, 8], "nino": [3, 8], "scherrer": [3, 8], "alic": [3, 5, 8, 9], "schoenauer": [3, 8], "sebag": [3, 8], "patrick": [3, 6, 8], "schramowski": [3, 8], "abolfazl": [3, 8], "shahbazi": [3, 8], "vin": [3, 8], "xudong": [3, 5, 6, 8], "vamsi": [3, 8], "sistla": [3, 8], "leonard": [3, 8], "testuggin": [3, 8], "vithursan": [3, 8], "thangarasa": [3, 8], "elizabeth": [3, 5, 8], "watkin": [3, 8], "rebecca": [3, 5, 8], "weiss": [3, 8], "welti": [3, 8], "tyler": [3, 5, 8], "wilber": [3, 8], "jean": [3, 8], "poonam": [3, 8], "yadav": [3, 8], "xianjun": [3, 8], "yang": [3, 5, 6, 7, 8, 9], "yi": [3, 5, 6, 8, 9], "wenhui": [3, 8], "fedor": [3, 8], "zhdanov": [3, 8], "jiacheng": [3, 5, 8], "perci": [3, 5, 8], "liang": [3, 5, 8, 9], "mattson": [3, 8], "joaquin": [3, 8], "vanschoren": [3, 8], "v0": [3, 8], "12241": [3, 8], "wyg": 3, "tianhao": [3, 5, 7, 8], "weizh": 3, "yuan": [3, 5, 8], "olga": 3, "golovneva": 3, "jing": [3, 8], "yuandong": 3, "tian": 3, "jiantao": 3, "jiao": 3, "jason": [3, 5, 6, 8], "weston": 3, "sainbayar": 3, "sukhbaatar": 3, "19594": 3, "ywx": 3, "yueqin": 3, "zhendong": 3, "yujia": [3, 6], "xie": [3, 5, 8], "mingyuan": 3, "paradigm": [3, 5, 6], "semanticscholar": 3, "corpusid": 3, "270199610": 3, "suppos": [4, 5, 9], "econom": [4, 5, 6], "fuel": 4, "equival": [4, 5, 7], "consumpt": [4, 5, 6], "contrari": 4, "truth": [4, 5, 7, 8, 9], "stanlei": 4, "jevon": 4, "a16z": 4, "andreessen": 4, "horowitz": 4, "10x": 4, "outpac": 4, "moor": 4, "pc": 4, "edholm": 4, "bandwidth": 4, "era": 4, "llmflation": 4, "mmlu": [4, 7, 8], "60": [4, 5, 6, 7, 8], "06": [4, 5, 6, 9], "price": [4, 5, 6, 7], "fallen": 4, "62": [4, 5, 7], "introduct": 4, "march": [4, 5, 9], "stem": [4, 5, 9], "compound": 4, "bit": [4, 6, 7], "tune": [4, 5, 6, 8], "dpo": [4, 7], "competit": [4, 5, 6, 7, 8], "plummet": 4, "rapidli": [4, 6, 7, 8, 9], "preciou": 4, "wouldn": [4, 5], "sens": [4, 8], "wait": [4, 5, 8], "wave": 4, "economist": 4, "1865": 4, "studi": [4, 9], "coal": 4, "industri": [4, 5, 6, 7, 8, 9], "made": [4, 5, 6, 7, 9], "counterintuit": 4, "discoveri": 4, "steam": 4, "spend": [4, 5, 6], "repeat": [4, 6], "didn": [4, 9], "smartphon": [4, 5, 6, 7], "server": [4, 5, 7, 9], "network": [4, 5, 6, 7, 9], "transmiss": 4, "got": 4, "cheaper": [4, 5], "shift": [4, 5, 6], "hd": 4, "stream": [4, 5, 6, 7, 9], "storag": [4, 5, 6, 7, 8], "gigabyt": 4, "entir": [4, 5, 6, 7, 9], "massiv": [4, 5, 6, 8], "broadli": [4, 6, 7, 9], "audio": [4, 5, 6], "transcript": [4, 6], "multimod": [4, 7, 8], "imag": [4, 5, 6, 7, 8], "exponenti": [4, 5], "growth": [4, 5, 6], "magnifi": 4, "everyth": [4, 9], "billion": [4, 5, 6, 7, 9], "dollar": [4, 5, 7], "annual": [4, 5, 6, 8], "millisecond": [4, 5], "latenc": [4, 5, 6, 7, 8], "30": [4, 5, 6, 7, 8], "mobil": [4, 5, 7, 9], "b": [4, 5, 6, 7, 8, 9], "tradeoff": [4, 7, 8, 9], "pro": [4, 5, 6, 7, 8], "trigger": [4, 6, 8], "premium": [4, 5], "innov": [4, 5, 6, 7, 8], "capac": [4, 5, 6, 7], "link": [4, 5], "dual": 4, "character": [4, 5, 8], "ahead": [4, 7, 8], "decai": [4, 7], "area": [4, 5, 6, 8, 9], "flash": [4, 6, 7], "cach": [4, 5, 6, 7], "prompt": [4, 5, 6, 8], "compress": [4, 5, 6, 7], "provis": [4, 5], "extent": [4, 5, 8], "problema": 4, "accomplish": [4, 6, 8, 9], "accompani": [4, 5, 8], "transact": [4, 5, 8], "roi": 4, "alloc": [4, 5, 6, 7, 8], "budget": [4, 6, 7], "viabil": [4, 7], "prioriti": [4, 5, 7], "overlook": [4, 6], "thorough": [4, 7, 8], "identif": [4, 5], "specifi": [4, 5, 6, 7, 8, 9], "longev": 4, "accommod": 4, "evalu": [4, 6, 7, 9], "multi": [4, 5, 6, 7, 8, 9], "baselin": [4, 5, 7, 8], "met": [4, 5, 8], "equal": [4, 5, 6, 8], "concurr": [4, 7], "peak": 4, "spike": 4, "versu": [4, 5, 7, 8], "volum": [4, 5, 7, 8], "season": [4, 5], "variat": [4, 5, 7, 8], "uptim": 4, "mainten": [4, 5, 7, 8], "disrupt": [4, 5, 6], "backup": 4, "failov": 4, "clearli": [4, 5, 8, 9], "redund": [4, 5], "recoveri": [4, 5], "unexpect": [4, 5, 8, 9], "event": [4, 5], "seamless": [4, 5, 8], "broader": [4, 5, 6, 7, 8], "vector": [4, 7, 8], "augment": [4, 5, 7], "rag": [4, 7], "retent": [4, 5, 6], "polici": [4, 5, 6, 7], "essenti": [4, 5, 6, 7, 8, 9], "opportun": [4, 5, 6], "post": [4, 5, 7, 8], "32": [4, 5, 6, 7], "fp32": 4, "fp16": [4, 7], "proport": [4, 5, 7], "byte": 4, "120": [4, 5, 8], "gb": 4, "whole": [4, 5], "done": [4, 5, 7, 8, 9], "smollm2": [4, 5, 7, 9], "135m": [4, 7], "load_gguf": 4, "bartowski": 4, "gguf": [4, 7], "gguf_file_q2_k": 4, "q2_k": [4, 7], "gguf_file_f16": 4, "f16": 4, "model_q2_k": 4, "gguf_fil": 4, "model_f16": 4, "mlp": 4, "layer": [4, 5, 6, 7, 9], "proxi": [4, 5, 6, 8], "mlp_weights_q2_k": 4, "gate_proj": 4, "mlp_weights_f16": 4, "tensor": [4, 6, 9], "0145": 4, "1826": 4, "1377": 4, "1719": 4, "1387": 4, "0298": 4, "1631": 4, "0781": 4, "2051": [4, 5], "2070": 4, "0334": 4, "2891": 4, "1768": 4, "0488": 4, "2393": 4, "0396": 4, "1348": 4, "1533": 4, "0771": 4, "0845": 4, "0232": 4, "0178": 4, "1040": 4, "1582": 4, "1167": 4, "0474": 4, "0359": 4, "2500": 4, "0432": 4, "0972": 4, "0933": 4, "2188": 4, "0776": 4, "0674": 4, "requires_grad": 4, "0028": 4, "1852": 4, "1396": 4, "1506": 4, "1635": 4, "0043": 4, "0680": 4, "2257": 4, "1890": 4, "0464": 4, "2960": 4, "1840": 4, "0451": 4, "2395": 4, "0413": 4, "1446": 4, "0621": 4, "0478": 4, "0038": 4, "0830": 4, "1473": 4, "0926": 4, "0547": 4, "0824": 4, "0429": 4, "2737": 4, "0355": 4, "0782": 4, "2043": [4, 5], "0740": 4, "arriv": [4, 5], "pearson": 4, "numpi": [4, 5], "np": [4, 5, 6], "arrai": [4, 6, 8], "detach": 4, "graph": [4, 5, 6], "weights_f16": 4, "weights_q2_k": 4, "flat_f16": 4, "flatten": 4, "flat_q2_k": 4, "corrcoef": 4, "4f": [4, 9], "9970": 4, "exemplifi": [4, 6, 7, 8], "70b": [4, 5, 7], "unsloth": 4, "141": 4, "q8_0": [4, 7], "75": [4, 8], "47": [4, 5, 7, 8], "cumul": [4, 5, 6], "26": [4, 5, 7], "19": [4, 5, 6, 7, 8], "space": [4, 5, 6, 7, 8], "counterpart": 4, "spectrum": [4, 5, 6], "variant": [4, 5, 7, 8], "laptop": [4, 5], "desktop": [4, 5, 7], "enterpris": [4, 5, 6, 7, 8, 9], "ceil": 4, "notabl": [4, 5, 6, 8, 9], "bitnet": 4, "cpp": [4, 9], "arm": 4, "x86": 4, "speedup": [4, 7], "37x": 4, "07x": 4, "17x": 4, "beyond": [4, 5, 6, 8], "raw": [4, 5, 7, 8, 9], "speed": [4, 5, 6, 7, 8], "energi": [4, 5, 6], "55": [4, 5, 6, 7], "70": [4, 5, 7], "71": [4, 5], "82": [4, 8], "impress": [4, 7, 9], "100b": 4, "b1": 4, "58": [4, 6, 7], "pace": [4, 5, 6, 8], "kernel": 4, "characterist": [4, 5, 7, 8, 9], "excit": [4, 7], "frontier": [4, 8], "compel": [4, 5, 7, 9], "acceler": [4, 5, 7, 8], "faster": [4, 6, 7], "arithmet": [4, 5], "benefici": [4, 5, 7], "sustain": [4, 5, 6, 7, 8], "Be": [4, 5, 7, 8], "fine": [4, 5, 6, 8], "pure": [4, 5, 7, 9], "unlock": [4, 9], "track": [4, 5, 6, 8], "chargeback": 4, "regularli": [4, 5], "wz": 4, "jinheng": 4, "hansong": 4, "ting": [4, 8], "shaoguang": 4, "shume": [4, 8], "ma": [4, 5, 8], "hongyu": [4, 5], "xia": [4, 5, 6, 7], "infra": 4, "fast": [4, 5, 6, 7, 8, 9], "lossless": 4, "16144": 4, "andreessenhorowitz24": 4, "huggingface4w": [4, 7], "2024w": [4, 7], "unsloth24": 4, "jonathan": [4, 5, 8], "ceo": [4, 5], "groq": [4, 7], "maarten": [4, 5, 6, 8], "grootendorst": [4, 6], "streamlin": [4, 5, 6, 7, 9], "notat": 4, "width": [4, 7], "_k": 4, "_0": 4, "matter": [5, 6], "beauti": 5, "smart": [5, 8], "agre": 5, "wrong": 5, "feynman": 5, "advent": 5, "pivot": [5, 7], "verif": [5, 6, 7, 9], "norm": 5, "realm": 5, "convent": [5, 8], "evolut": [5, 7], "conceiv": 5, "entrench": 5, "seem": 5, "daunt": [5, 6], "ignor": 5, "outdat": [5, 6, 8, 9], "inevit": 5, "setback": 5, "imper": 5, "embrac": 5, "proactiv": [5, 8], "mindset": 5, "front": [5, 7], "incorpor": [5, 6, 7, 8, 9], "produc": [5, 6, 7, 8, 9], "novel": [5, 7], "ident": [5, 6], "isn": [5, 8], "bug": 5, "random": [5, 8, 9], "testabl": 5, "guarante": [5, 6, 7, 8, 9], "exceedingli": 5, "primari": [5, 6, 8], "nucleu": 5, "2020": 5, "summari": [5, 6, 7, 8, 9], "alter": 5, "rigid": 5, "wildli": 5, "incoher": 5, "inadequ": [5, 8], "temp": 5, "df_result": 5, "ntemperatur": 5, "40": [5, 6, 7], "temp_respons": 5, "iterrow": [5, 8], "10000": [5, 6, 9], "appl": [5, 6, 9], "txt": [5, 6, 7, 9], "sec_fil": [5, 9], "nsecur": 5, "AND": [5, 9], "exchang": [5, 6, 8, 9], "commiss": [5, 6, 8, 9], "nwashington": 5, "20549": 5, "nform": 5, "pursuant": 5, "TO": [5, 8], "13": [5, 6, 7, 8], "OR": 5, "OF": [5, 8], "THE": [5, 8], "1934": 5, "nfor": 5, "fiscal": [5, 6], "septemb": [5, 6], "28": [5, 6, 7, 8], "nor": [5, 6], "period": [5, 6, 8], "ncommiss": 5, "001": [5, 7], "36743": 5, "ng66145g66i43": 5, "jpg": 5, "nappl": 5, "exact": [5, 7, 8], "registr": 5, "charter": 5, "ncalifornia": 5, "t94": 5, "2404110": 5, "jurisdict": 5, "nof": 5, "employ": 5, "park": 5, "ncupertino": 5, "california": [5, 8, 9], "n95014": 5, "princip": 5, "offic": [5, 6, 8], "408": 5, "996": 5, "1010": 5, "telephon": 5, "regist": 5, "ntitl": 5, "ttrade": 5, "symbol": 5, "tname": 5, "ncommon": 5, "stock": [5, 9], "00001": 5, "naapl": 5, "tthe": 5, "nasdaq": [5, 6, 9], "llc": [5, 9], "n0": 5, "000": [5, 7, 9], "note": [5, 7, 9], "2025": [5, 6], "875": 5, "625": 5, "2026": 5, "2027": 5, "375": 5, "2029": 5, "050": 5, "2031": [5, 8], "600": 5, "2042": 5, "nindic": 5, "issuer": 5, "405": 5, "nye": 5, "preced": [5, 9], "shorter": [5, 6], "past": [5, 6, 8], "90": [5, 6, 7, 8], "submit": [5, 7, 8], "electron": 5, "232": 5, "filer": 5, "12b": [5, 8], "nlarg": 5, "tacceler": 5, "nnon": 5, "tsmaller": 5, "nemerg": 5, "nif": 5, "elect": [5, 8], "revis": [5, 8], "attest": 5, "404": 5, "sarban": 5, "oxlei": 5, "7262": 5, "firm": [5, 8], "prepar": [5, 7, 8], "correct": [5, 6, 8], "restat": 5, "incent": 5, "compens": 5, "240": 5, "10d": 5, "shell": 5, "aggreg": [5, 8, 9], "vote": 5, "held": [5, 9], "affili": [5, 9], "29": [5, 7, 8, 9], "last": [5, 6, 8, 9], "quarter": 5, "628": [5, 9], "553": [5, 9], "sole": [5, 6, 8], "disclosur": [5, 6, 7, 8], "director": [5, 7, 8], "date": 5, "exclud": 5, "n15": 5, "115": [5, 9], "823": [5, 9], "outstand": [5, 9], "octob": [5, 9], "18": [5, 6, 7, 8, 9], "ndocument": 5, "BY": 5, "nportion": 5, "meet": [5, 6, 8, 9], "sharehold": [5, 6], "iii": 5, "ntabl": 5, "npage": 5, "npart": 5, "nitem": 5, "nbusi": 5, "1a": 5, "nrisk": 5, "1b": [5, 7, 8], "nunresolv": 5, "staff": 5, "comment": 5, "n17": 5, "1c": 5, "ncybersecur": 5, "nproperti": 5, "n18": 5, "nlegal": 5, "proceed": [5, 6, 8], "nmine": 5, "ii": [5, 7, 9], "nmarket": 5, "stockhold": 5, "purchas": [5, 6, 8], "n19": 5, "reserv": [5, 6], "n20": 5, "nmanag": 5, "n21": 5, "7a": 5, "nquantit": 5, "n27": 5, "nfinanci": 5, "supplementari": 5, "n28": 5, "nchang": 5, "disagr": 5, "n51": 5, "9a": 5, "ncontrol": 5, "procedur": [5, 6, 8], "9b": 5, "nother": 5, "n52": 5, "9c": 5, "ndisclosur": 5, "foreign": [5, 6], "ndirector": 5, "corpor": [5, 6, 8], "nexecut": 5, "ownership": [5, 7], "certain": [5, 6, 8, 9], "owner": 5, "ncertain": 5, "nprincip": 5, "fee": [5, 6], "iv": 5, "nexhibit": 5, "n53": 5, "n56": 5, "nthi": 5, "litig": [5, 6, 7], "reform": 5, "1995": 5, "uncertainti": [5, 6, 7, 8], "macroeconom": [5, 6], "anticip": [5, 6, 8], "intend": [5, 7, 8], "caus": [5, 8, 9], "oblig": [5, 6], "nunless": 5, "herein": 5, "calendar": 5, "wholli": 5, "subsidiari": 5, "unless": [5, 7], "ncompani": 5, "manufactur": 5, "tablet": [5, 6, 7], "wearabl": 5, "accessori": 5, "sell": [5, 8], "varieti": [5, 7], "52": [5, 8], "53": [5, 6, 8], "week": 5, "saturdai": 5, "nproduct": 5, "niphon": 5, "io": [5, 6, 8, 9], "iphon": [5, 6], "se": [5, 8], "nmac": 5, "maco": [5, 7], "mac": [5, 7], "macbook": 5, "air": 5, "imac": 5, "studio": 5, "nipad": 5, "multipurpos": 5, "ipado": 5, "ipad": 5, "nwearabl": 5, "home": [5, 6, 9], "smartwatch": 5, "wireless": 5, "headphon": 5, "spatial": 5, "watcho": 5, "watch": 5, "ultra": 5, "airpod": 5, "beat": [5, 7], "visiono": 5, "nhome": 5, "tv": 5, "tvo": 5, "homepod": 5, "fidel": [5, 9], "naccessori": 5, "brand": 5, "third": [5, 6, 7, 8], "parti": [5, 6, 7, 8], "nservic": 5, "nadvertis": 5, "advertis": 5, "licens": [5, 6], "napplecar": 5, "portfolio": [5, 6], "applecar": 5, "repair": 5, "coverag": [5, 6, 8], "accident": 5, "damag": [5, 8], "theft": [5, 8], "ncloud": 5, "ndigit": 5, "app": [5, 6, 7], "discov": [5, 7, 8], "download": [5, 6, 7], "music": 5, "podcast": 5, "subscript": [5, 7], "arcad": 5, "sm": 5, "listen": [5, 7], "radio": 5, "station": 5, "magazin": 5, "exclus": 5, "sport": 5, "npayment": 5, "payment": 5, "credit": [5, 6], "pai": [5, 7], "cashless": 5, "nsegment": 5, "primarili": [5, 6, 8], "geograph": [5, 6, 8], "basi": [5, 7], "segment": [5, 6, 8, 9], "america": [5, 6], "europ": 5, "china": [5, 6, 7, 8], "japan": 5, "rest": [5, 7], "asia": 5, "pacif": 5, "north": [5, 8], "south": 5, "european": [5, 8], "india": 5, "middl": [5, 6, 7, 8], "east": 5, "africa": 5, "mainland": 5, "kong": 5, "taiwan": 5, "australia": 5, "asian": [5, 6], "although": [5, 7], "partner": [5, 6, 7, 8], "mid": [5, 6], "resel": [5, 6], "retail": 5, "sale": [5, 6], "indirect": 5, "channel": [5, 6, 8], "cellular": 5, "carrier": 5, "net": [5, 6, 9], "38": [5, 6, 7, 8], "ncompetit": 5, "downward": 5, "pressur": [5, 8], "gross": [5, 8], "cycl": [5, 8], "competitor": [5, 6, 7, 8], "compet": [5, 6, 7], "imit": 5, "infring": [5, 7], "intellectu": [5, 7, 8], "marketplac": [5, 8], "nearli": [5, 7], "reput": [5, 8], "expand": [5, 6, 7, 8], "profit": [5, 6, 8, 9], "illegitim": [5, 8], "collabor": [5, 7, 8], "nsuppli": 5, "nalthough": 5, "particip": 5, "shortag": 5, "commod": [5, 6, 7], "fluctuat": [5, 6], "commonli": [5, 6], "until": [5, 8, 9], "supplier": 5, "matur": 5, "concentr": [5, 6], "enter": [5, 9], "agreement": [5, 6], "suppli": [5, 6, 9], "renew": [5, 6], "nresearch": 5, "nbecaus": 5, "upon": [5, 6, 8], "flow": [5, 6, 9], "acquisit": [5, 6, 8], "nintellectu": 5, "broad": [5, 6, 7, 9], "patent": 5, "copyright": [5, 7], "trademark": 5, "secret": 5, "differenti": 5, "skill": [5, 8], "personnel": 5, "pursu": [5, 8], "thousand": [5, 7], "durat": 5, "adequ": [5, 8], "nin": 5, "holidai": [5, 8], "fill": 5, "inventori": 5, "older": [5, 7], "newer": 5, "distributor": 5, "nhuman": 5, "strive": 5, "retain": [5, 6, 7, 8], "talent": [5, 6], "member": [5, 8], "164": 5, "ncompens": 5, "equit": 5, "succe": 5, "health": [5, 6, 8], "awai": [5, 6, 8], "ngrowth": 5, "career": 5, "leadership": [5, 8], "nworkplac": 5, "workplac": 5, "ninclus": 5, "workforc": 5, "nengag": 5, "among": [5, 6, 7, 8, 9], "everyon": [5, 7], "gaug": 5, "sentiment": [5, 6, 7, 9], "nhealth": 5, "everywher": 5, "crisi": 5, "visitor": 5, "navail": 5, "quarterli": 5, "q": [5, 6, 7, 8], "amend": 5, "sec": [5, 6, 9], "Such": [5, 8], "charg": 5, "investor": [5, 6, 9], "aspx": 5, "websit": [5, 6, 7, 8], "environment": [5, 8], "referenc": [5, 6], "inact": 5, "textual": 5, "unknown": [5, 6, 8], "advers": 5, "conjunct": 5, "consolid": [5, 6], "nmacroeconom": 5, "facil": 5, "assembli": 5, "site": [5, 9], "nadvers": 5, "slow": [5, 6], "recess": 5, "unemploy": [5, 6], "inflat": [5, 6], "tighter": 5, "currenc": [5, 6], "monetari": 5, "contract": [5, 7], "logist": 5, "instabl": [5, 8], "inabl": [5, 6], "financ": [5, 6, 7, 8], "insolv": 5, "counterparti": 5, "debt": 5, "liquid": [5, 6], "fair": [5, 8], "instrument": 5, "polit": [5, 8], "disput": 5, "geopolit": 5, "tension": [5, 8], "terror": 5, "accid": 5, "interrupt": 5, "npolit": 5, "outsourc": [5, 6], "korea": 5, "vietnam": 5, "restrict": [5, 7, 8, 9], "tariff": 5, "export": [5, 6], "portion": [5, 7], "revenu": [5, 6, 9], "restructur": 5, "ceas": 5, "escal": [5, 8], "nmani": 5, "prone": [5, 6, 8], "earthquak": 5, "climat": 5, "weather": 5, "plant": 5, "terrorist": [5, 8], "attack": [5, 8], "hostil": 5, "ransomwar": 5, "cybersecur": [5, 6, 8], "labor": 5, "nsuch": 5, "imposs": [5, 7], "slowdown": 5, "outag": 5, "neg": [5, 6, 8, 9], "pandem": 5, "covid": 5, "economi": 5, "imposit": 5, "stringent": [5, 7, 8], "travel": 5, "freight": 5, "movement": 5, "ramp": 5, "nfollow": 5, "expenditur": 5, "resum": 5, "exacerb": [5, 6], "insur": 5, "nglobal": 5, "unabl": 5, "assur": [5, 8], "minor": [5, 6, 8], "naddition": 5, "intensifi": 5, "seamlessli": 5, "nto": 5, "stimul": 5, "ndue": 5, "upgrad": 5, "quantiti": 5, "defect": 5, "defici": 5, "supersed": 5, "nsubstanti": 5, "transport": 5, "reimburs": 5, "warranti": 5, "unanticip": 5, "liabil": 5, "finish": [5, 8], "destin": 5, "prepay": 5, "termin": [5, 7], "recover": 5, "exposur": [5, 8], "nfutur": 5, "semiconductor": 5, "suffer": [5, 6, 8], "constrain": [5, 7, 9], "shipment": 5, "unexpectedli": 5, "interfer": 5, "unsaf": [5, 8], "expos": [5, 6, 8], "widespread": [5, 8], "vulner": [5, 6, 8], "compromis": [5, 7, 8], "claim": [5, 6, 7, 8], "intang": 5, "lost": [5, 6, 8], "cancel": 5, "obsolet": 5, "exce": [5, 8], "realiz": 5, "accru": 5, "excess": 5, "impair": 5, "whenev": 5, "circumst": 5, "amount": [5, 6, 8, 9], "carri": [5, 7, 9], "incur": [5, 6], "unpredict": [5, 8], "obsolesc": 5, "forecast": [5, 6, 8], "incorrectli": [5, 8, 9], "extens": [5, 6, 7, 9], "issuanc": 5, "unknowingli": [5, 8], "notifi": 5, "preclud": 5, "bui": 5, "percept": 5, "android": [5, 6], "playstat": 5, "nintendo": 5, "xbox": 5, "inclin": 5, "devot": 5, "dissatisfi": 5, "vast": [5, 6, 8], "storefront": 5, "safari": 5, "union": [5, 8], "eu": [5, 6, 8], "dma": [5, 6], "narrow": [5, 7, 8], "scope": [5, 6, 7, 8], "elimin": [5, 6, 7], "nfailur": 5, "appeal": [5, 6], "subscrib": [5, 6], "nsome": 5, "manner": [5, 6, 8], "nurtur": 5, "nmuch": 5, "chief": [5, 6], "silicon": 5, "vallei": 5, "constantli": 5, "driver": [5, 7], "recruit": 5, "subsidi": 5, "staf": 5, "contractor": 5, "placement": 5, "increment": 5, "weaken": 5, "telecommun": 5, "war": 5, "virus": 5, "ins": 5, "incid": [5, 8], "ineffect": 5, "thing": [5, 9], "interf": 5, "imped": 5, "ship": 5, "nloss": 5, "unauthor": [5, 8], "confidenti": [5, 7], "encrypt": 5, "But": [5, 6, 8, 9], "behalf": 5, "normal": [5, 6, 8, 9], "investig": [5, 6, 8], "penalti": [5, 7], "frequenc": [5, 7, 8], "actor": [5, 8], "circumv": [5, 8], "obfusc": 5, "forens": 5, "hinder": [5, 9], "recov": 5, "perpetr": 5, "profil": [5, 7], "authent": 5, "hack": [5, 8], "malfeas": 5, "faulti": 5, "password": 5, "irregular": 5, "fraudul": 5, "induc": 5, "disclos": [5, 6, 9], "usernam": 5, "turn": [5, 6, 8, 9], "multifactor": 5, "unusu": 5, "freez": 5, "suspici": 5, "nwhile": 5, "ninvest": 5, "ongo": [5, 6, 7], "contempl": 5, "endeavor": 5, "distract": 5, "tangibl": 5, "approv": 5, "oner": 5, "ventur": 5, "riski": 5, "leas": 5, "unfavor": [5, 6], "arisen": 5, "ordinari": 5, "cours": [5, 6, 7, 8], "resolv": [5, 7, 8], "sometim": [5, 6], "indemnif": 5, "indemnifi": 5, "alleg": 5, "magnitud": 5, "assert": [5, 6], "royalti": 5, "vigor": 5, "defend": 5, "court": [5, 7], "internation": 5, "plaintiff": 5, "injunct": 5, "relief": 5, "nregardless": 5, "merit": 5, "recognit": [5, 7, 8], "settl": 5, "uncertain": [5, 6], "disgorg": 5, "remedi": [5, 8], "worldwid": 5, "antitrust": [5, 6], "bill": [5, 6], "commerc": 5, "televis": 5, "film": 5, "anticorrupt": 5, "cash": [5, 6], "repatri": 5, "launder": 5, "tax": [5, 6], "wast": 5, "recycl": 5, "ncomplianc": 5, "impos": [5, 7, 8, 9], "agent": [5, 7, 8], "nregulatori": 5, "ban": [5, 8], "nexpect": 5, "increasingli": [5, 6, 7, 8, 9], "greenhous": 5, "ga": 5, "emiss": 5, "civil": 5, "disagre": 5, "perceiv": 5, "feder": 5, "nfrom": 5, "noncompli": 5, "individu": [5, 6, 7, 8], "lawsuit": [5, 7], "monopol": 5, "nfurther": 5, "earn": 5, "search": [5, 6, 7, 8], "nthere": 5, "transfer": 5, "pass": [5, 6, 7, 8, 9], "pend": 5, "inquiri": [5, 8], "government": 5, "entiti": [5, 7, 8, 9], "biometr": 5, "notif": 5, "permit": [5, 7, 9], "healthcar": [5, 6, 7], "liabl": 5, "investigatori": 5, "cardhold": 5, "acquir": 5, "denomin": 5, "offset": 5, "strengthen": [5, 8], "nconvers": 5, "thu": 5, "hedg": 5, "deterior": 5, "sovereign": 5, "heighten": [5, 8], "worsen": 5, "A": [5, 7, 8, 9], "collater": 5, "bank": 5, "unsecur": 5, "subassembli": 5, "assembl": 5, "legisl": 5, "ireland": [5, 8], "singapor": 5, "organis": 5, "statutori": 5, "valuat": [5, 6], "defer": 5, "bodi": [5, 8], "adequaci": 5, "ow": 5, "ngener": 5, "repurchas": 5, "dividend": 5, "consumm": 5, "declar": [5, 6], "board": [5, 6, 8], "unresolv": 5, "nnone": 5, "threat": [5, 6, 8], "postur": 5, "25": [5, 6, 7, 8], "2016": 5, "coordin": [5, 8], "committe": [5, 8], "oversight": [5, 8], "counsel": 5, "chair": 5, "headquart": 5, "cupertino": [5, 9], "center": [5, 8, 9], "formal": [5, 8, 9], "uninstal": 5, "web": [5, 6, 7, 8], "browser": 5, "june": 5, "contractu": 5, "desist": 5, "stai": [5, 7], "grant": 5, "ndepart": 5, "justic": 5, "depart": [5, 8], "doj": 5, "district": 5, "attornei": 5, "jersei": 5, "redress": [5, 8], "anticompetit": 5, "nonmonetari": 5, "defens": [5, 8], "nepic": 5, "epic": 5, "northern": 5, "unfair": [5, 8], "enjoin": 5, "extern": [5, 6, 8], "januari": 5, "motion": 5, "oppos": [5, 8], "vacat": 5, "fourth": 5, "mine": 5, "nnot": 5, "aapl": 5, "nholder": 5, "na": [5, 8], "301": 5, "npurchas": 5, "nshare": 5, "nperiod": 5, "ttotal": 5, "taverag": 5, "npaid": 5, "nannounc": 5, "napproxim": 5, "That": [5, 6, 8, 9], "nunder": 5, "njune": 5, "august": [5, 6, 8], "nopen": 5, "negoti": [5, 8], "t35": 5, "697": 5, "t224": 5, "naugust": 5, "31": [5, 6, 7], "t42": 5, "910": 5, "t221": 5, "39": [5, 6, 7], "nseptemb": 5, "t33": 5, "653": 5, "t222": 5, "86": [5, 6, 7], "ntotal": [5, 8], "t112": 5, "260": 5, "t89": 5, "074": 5, "110": 5, "10b5": 5, "reinvest": 5, "dow": 5, "supersector": 5, "27": [5, 7, 8], "2019": 5, "n2218": 5, "tseptemb": 5, "t100": 5, "t207": 5, "t273": 5, "t281": 5, "t322": 5, "t430": 5, "t113": 5, "t156": 5, "t131": 5, "t155": 5, "t210": 5, "ndow": 5, "t146": 5, "t216": 5, "t215": 5, "nfirst": 5, "nsecond": 5, "nthird": 5, "sequoia": 5, "nfourth": 5, "plu": [5, 7], "nfiscal": 5, "six": 5, "realign": 5, "span": [5, 7, 8], "indirectli": 5, "n2024": 5, "tchang": 5, "t2023": 5, "t2022": 5, "namerica": 5, "t167": 5, "045": 5, "t3": 5, "t162": 5, "560": 5, "t169": 5, "658": 5, "neurop": 5, "t101": 5, "328": 5, "t7": 5, "294": 5, "t95": 5, "118": 5, "ngreater": 5, "t66": 5, "952": 5, "t72": 5, "559": 5, "t74": 5, "njapan": 5, "t25": 5, "052": 5, "t24": 5, "257": 5, "977": 5, "nrest": 5, "t30": 5, "t4": 5, "t29": 5, "615": 5, "t1": 5, "t391": 5, "035": 5, "t2": 5, "t383": 5, "285": 5, "t394": 5, "weak": [5, 6, 8], "renminbi": 5, "yen": [5, 9], "t201": 5, "183": 5, "t200": 5, "583": 5, "t205": 5, "489": 5, "984": 5, "357": 5, "t40": 5, "177": [5, 8], "t26": 5, "694": 5, "t28": 5, "300": 5, "292": 5, "t37": 5, "005": 5, "t39": 5, "845": [5, 8], "t41": 5, "241": 5, "n96": 5, "169": 5, "t13": 5, "t85": 5, "t9": 5, "t78": 5, "129": [5, 8], "amort": 5, "bundl": 5, "flat": [5, 6], "ngross": 5, "t109": 5, "633": 5, "t108": 5, "803": 5, "t114": 5, "728": 5, "t71": 5, "t60": 5, "345": 5, "t56": 5, "054": 5, "t180": 5, "683": 5, "148": 5, "t170": 5, "782": 5, "t36": 5, "t73": 5, "t70": 5, "t46": 5, "t44": 5, "t43": 5, "noper": 5, "t31": 5, "370": 5, "t5": 5, "915": 5, "t14": 5, "251": 5, "npercentag": 5, "t8": 5, "nsell": 5, "administr": 5, "097": 5, "932": 5, "094": 5, "t6": 5, "t57": 5, "467": 5, "t54": 5, "847": 5, "t51": 5, "t15": 5, "headcount": 5, "nprovis": 5, "749": 5, "t16": 5, "741": 5, "t19": 5, "neffect": 5, "nstatutori": 5, "t21": 5, "aid": [5, 8], "nliquid": 5, "unrestrict": 5, "140": 5, "ndebt": 5, "97": [5, 6, 8], "payabl": 5, "promissori": 5, "nleas": 5, "nmanufactur": 5, "noncancel": 5, "ndeem": 5, "tcja": 5, "nstate": 5, "fund": [5, 6, 7], "escrow": 5, "ncapit": 5, "95": [5, 8], "nrecent": 5, "pronounc": 5, "nincom": 5, "fasb": 5, "asu": 5, "09": [5, 6, 8], "740": 5, "reconcili": [5, 6], "reconcil": [5, 9], "disaggreg": 5, "prospect": 5, "novemb": [5, 8], "07": [5, 6, 8, 9], "280": 5, "maker": 5, "codm": 5, "retrospect": 5, "ncritic": 5, "conform": [5, 9], "gaap": 5, "nuncertain": 5, "domest": 5, "taxat": 5, "resolut": [5, 6], "conting": 5, "ninterest": 5, "forth": 5, "hypothet": 5, "nsensit": 5, "nhypothet": 5, "nrate": 5, "npotenti": 5, "n100": 5, "tenor": 5, "ndeclin": 5, "755": 5, "089": 5, "nterm": 5, "nincreas": 5, "t139": 5, "t194": 5, "nforeign": 5, "var": 5, "mont": 5, "carlo": 5, "interv": 5, "538": 5, "669": 5, "nindex": 5, "tpage": 5, "nconsolid": 5, "n29": 5, "n30": 5, "sheet": 5, "n31": 5, "n32": 5, "n33": 5, "nnote": 5, "n34": 5, "nreport": 5, "n48": 5, "nall": 5, "omit": 5, "submiss": 5, "nyear": 5, "n2023": 5, "n2022": 5, "nnet": 5, "t294": 5, "866": 5, "t298": 5, "085": 5, "t316": 5, "199": 5, "t96": 5, "ncost": 5, "t185": 5, "233": 5, "t189": 5, "282": 5, "471": 5, "119": 5, "855": 5, "t22": 5, "075": 5, "352": 5, "t214": 5, "137": 5, "t223": 5, "546": 5, "t123": 5, "216": 5, "t119": 5, "437": 5, "t269": 5, "565": 5, "334": 5, "485": 5, "736": 5, "103": 5, "t93": 5, "995": 5, "t99": 5, "nearn": 5, "nbasic": 5, "ndilut": 5, "08": [5, 7, 9], "343": [5, 8], "783": 5, "744": 5, "215": 5, "963": 5, "095": 5, "812": 5, "547": 5, "325": 5, "819": 5, "nsee": 5, "translat": [5, 7, 8], "t395": 5, "765": 5, "511": 5, "unreal": 5, "832": 5, "t323": 5, "212": 5, "nadjust": 5, "337": 5, "717": 5, "394": 5, "138": 5, "850": 5, "563": 5, "104": 5, "t204": 5, "t253": 5, "816": 5, "899": 5, "272": 5, "t98": 5, "016": 5, "652": 5, "t88": 5, "531": 5, "nasset": 5, "ncurrent": 5, "ncash": 5, "943": 5, "965": 5, "228": 5, "590": 5, "naccount": 5, "410": 5, "508": 5, "nvendor": 5, "t32": 5, "833": 5, "477": 5, "ninventori": 5, "286": 5, "331": 5, "287": 5, "695": 5, "t152": 5, "987": 5, "t143": 5, "566": 5, "t91": 5, "479": 5, "544": 5, "t45": 5, "680": 5, "715": 5, "834": 5, "t64": 5, "758": 5, "t211": 5, "993": 5, "t209": 5, "017": 5, "t364": 5, "980": [5, 8], "t352": 5, "nliabil": 5, "t68": 5, "960": 5, "t62": 5, "611": 5, "304": 5, "t58": 5, "829": 5, "ndefer": 5, "249": 5, "061": 5, "ncommerci": 5, "967": 5, "985": 5, "t10": 5, "912": 5, "822": 5, "t176": 5, "392": 5, "t145": 5, "308": 5, "750": 5, "888": 5, "t49": 5, "848": 5, "638": 5, "t308": 5, "030": [5, 7], "t290": 5, "ncommit": 5, "nsharehold": 5, "400": [5, 6], "116": 5, "786": 5, "550": 5, "n83": 5, "276": 5, "naccumul": 5, "deficit": 5, "154": 5, "214": 5, "172": 5, "452": 5, "950": 5, "146": [5, 8], "t50": 5, "672": 5, "t63": 5, "090": 5, "nbegin": 5, "849": 5, "365": 5, "423": 5, "346": [5, 6], "175": 5, "withheld": 5, "settlement": 5, "521": 5, "971": 5, "t12": 5, "034": 5, "t11": 5, "nend": 5, "t83": 5, "nretain": 5, "068": 5, "562": 5, "ndividend": 5, "218": 5, "793": 5, "612": 5, "099": 5, "454": 5, "846": 5, "77": [5, 6, 7], "046": 5, "186": 5, "109": 5, "t163": 5, "rsu": 5, "t0": 5, "98": [5, 6, 7], "94": [5, 6, 7, 8], "737": 5, "929": 5, "ndepreci": 5, "445": 5, "519": 5, "688": 5, "038": 5, "266": 5, "227": 5, "006": 5, "788": 5, "356": 5, "271": 5, "520": 5, "618": 5, "484": 5, "731": 5, "684": 5, "499": 5, "020": 5, "889": 5, "448": 5, "552": 5, "031": 5, "t118": 5, "254": 5, "t110": 5, "543": 5, "t122": 5, "151": 5, "48": [5, 7], "656": 5, "513": 5, "76": [5, 8], "923": 5, "nproce": 5, "211": 5, "686": 5, "917": 5, "135": 5, "828": [5, 6], "446": 5, "447": 5, "959": 5, "708": 5, "086": 5, "935": 5, "705": 5, "354": 5, "nfinanc": 5, "441": 5, "431": 5, "223": [5, 8], "234": [5, 8], "025": 5, "841": 5, "nrepurchas": 5, "949": 5, "89": [5, 8], "402": 5, "465": 5, "nrepay": 5, "958": 5, "repay": 5, "978": [5, 6], "955": 5, "361": 5, "581": 5, "160": 5, "121": 5, "983": 5, "488": 5, "794": 5, "760": 5, "nsupplement": 5, "102": 5, "t18": 5, "679": 5, "573": 5, "33": [5, 6, 7, 8], "nbasi": 5, "prior": [5, 8], "reclassifi": 5, "nrevenu": 5, "remit": [5, 8], "straight": 5, "vest": 5, "sold": 5, "nderiv": 5, "nonleas": 5, "34": [5, 6, 8], "entitl": 5, "commenc": 5, "deliveri": 5, "stand": 5, "ssp": 5, "icloud": 5, "siri": 5, "discount": 5, "undeliv": 5, "unbil": 5, "n26": 5, "n37": 5, "moder": [5, 7], "64": [5, 7, 8], "dilut": 5, "nnumer": 5, "ndenomin": 5, "nweight": 5, "312": 5, "316": 5, "856": 5, "antidilut": 5, "tunreal": 5, "ngain": 5, "tfair": 5, "nvalu": 5, "tcash": 5, "nequival": 5, "tcurrent": 5, "tnon": 5, "t27": 5, "nlevel": 5, "nmonei": 5, "t778": 5, "nmutual": 5, "n515": 5, "t105": 5, "t617": 5, "nsubtot": 5, "293": 5, "395": 5, "nu": 5, "treasuri": 5, "516": 5, "t212": 5, "087": 5, "380": 5, "159": 5, "t703": 5, "t17": 5, "568": 5, "158": 5, "810": 5, "ncertif": 5, "deposit": 5, "t873": 5, "t387": 5, "t478": 5, "066": 5, "ncorpor": 5, "t65": 5, "622": 5, "t270": 5, "953": 5, "939": 5, "027": 5, "t47": 5, "886": 5, "nmunicip": 5, "t412": 5, "t405": 5, "t190": 5, "nmortgag": 5, "595": 5, "t175": 5, "403": 5, "t23": 5, "367": 5, "278": [5, 8], "t132": 5, "t583": 5, "635": 5, "t128": 5, "056": 5, "966": 5, "t34": 5, "t160": 5, "t688": 5, "650": 5, "36": [5, 6, 7, 8], "359": [5, 8], "t481": 5, "n442": 5, "t428": 5, "t923": 5, "t909": 5, "406": 5, "114": 5, "468": 5, "136": 5, "t271": 5, "533": 5, "048": [5, 7], "491": 5, "332": 5, "t320": 5, "t608": 5, "t76": 5, "840": 5, "956": 5, "890": 5, "t20": 5, "627": 5, "243": 5, "t628": 5, "t602": 5, "t192": 5, "t410": 5, "735": 5, "636": 5, "t344": 5, "t144": 5, "470": 5, "657": 5, "831": 5, "125": 5, "162": 5, "t173": 5, "752": 5, "corrobor": 5, "mortgag": [5, 6], "classifi": [5, 8], "37": [5, 7, 8], "swap": 5, "remeasur": 5, "notion": 5, "069": 5, "730": 5, "575": 5, "493": 5, "t104": 5, "777": 5, "nhedg": 5, "433": 5, "505": 5, "247": [5, 8], "ntrade": 5, "41": [5, 7, 8], "44": [5, 8], "depreci": 5, "nland": 5, "690": 5, "nmachineri": 5, "t80": 5, "205": [5, 7], "314": 5, "nleasehold": 5, "839": 5, "599": 5, "73": [5, 7, 8], "884": 5, "852": 5, "t55": 5, "906": 5, "601": 5, "703": 5, "010": 5, "457": 5, "634": 5, "391": 5, "neuropean": 5, "opinion": [5, 6, 8], "1991": 5, "2007": 5, "irish": 5, "branch": 5, "2003": 5, "2014": [5, 6], "2015": 5, "minist": 5, "juli": [5, 8], "annul": 5, "ecj": 5, "hear": 5, "asid": 5, "confirm": 5, "unrecogn": [5, 6], "nfeder": 5, "571": 5, "080": 5, "644": 5, "265": 5, "801": 5, "726": 5, "570": 5, "298": 5, "49": [5, 6, 8], "t84": 5, "428": 5, "603": 5, "483": [5, 8], "t347": 5, "t669": 5, "076": 5, "830": 5, "419": 5, "072": 5, "pretax": 5, "72": [5, 6, 8], "ncomput": 5, "885": 5, "012": 5, "124": 5, "518": 5, "nimpact": 5, "246": 5, "311": 5, "366": 5, "397": 5, "nexcess": 5, "893": 5, "871": 5, "192": [5, 8], "739": 5, "ntax": 5, "carryforward": 5, "302": 5, "naccru": 5, "413": [5, 8], "421": 5, "nunreal": 5, "173": 5, "168": 5, "873": 5, "743": 5, "nless": 5, "374": 5, "007": 5, "369": 5, "551": 5, "998": 5, "nright": 5, "179": 5, "nminimum": 5, "674": 5, "940": 5, "t511": 5, "t455": 5, "t490": 5, "805": 5, "202": 5, "indefinit": 5, "temporari": 5, "727": 5, "044": 5, "284": 5, "ndecreas": 5, "386": 5, "463": 5, "982": 5, "542": 5, "936": 5, "070": 5, "expir": 5, "statut": 5, "229": 5, "494": 5, "closur": 5, "intercompani": 5, "exceed": [5, 8], "multiyear": 5, "exercis": 5, "noncash": 5, "rou": 5, "tfinanci": 5, "t2024": 5, "tother": 5, "661": 5, "tproperti": 5, "015": 5, "303": 5, "676": 5, "t165": 5, "t752": 5, "t859": 5, "430": 5, "842": [5, 8], "tfinanc": 5, "n2025": 5, "820": 5, "t171": 5, "991": 5, "n2026": 5, "914": 5, "n2027": 5, "t59": 5, "733": 5, "n2028": 5, "360": 5, "t38": 5, "398": 5, "n2029": 5, "187": 5, "nthereaft": 5, "t837": 5, "undiscount": 5, "790": 5, "imput": 5, "376": 5, "534": 5, "t896": 5, "borrow": 5, "proce": 5, "nine": [5, 8], "nmatur": 5, "333": 5, "264": 5, "948": 5, "645": 5, "309": 5, "arrear": 5, "namount": 5, "n2013": 5, "nfix": 5, "2062": 5, "t97": 5, "341": 5, "03": [5, 6], "65": [5, 8], "t106": 5, "572": 5, "n97": 5, "nunamort": 5, "321": 5, "358": 5, "113": 5, "662": 5, "930": 5, "342": 5, "800": 5, "180": 5, "88": [5, 6], "ndure": 5, "425": 5, "426": 5, "372": 5, "589": 5, "055": 5, "appreci": 5, "four": [5, 6, 7, 8], "holder": [5, 7], "n2014": 5, "bonu": 5, "nrestrict": 5, "nnumber": 5, "nrsu": 5, "ngrant": 5, "naggreg": 5, "nfair": 5, "nbalanc": 5, "t240": 5, "427": [5, 8], "t75": 5, "t150": 5, "861": 5, "501": 5, "768": 5, "87": [5, 6, 7, 8], "101": [5, 8], "878": 5, "144": 5, "t127": 5, "t135": 5, "91": [5, 8], "456": 5, "78": [5, 7, 8], "59": [5, 8], "t140": 5, "326": 5, "t158": 5, "204": 5, "350": 5, "002": [5, 7], "nuncondit": 5, "uncondit": 5, "206": 5, "440": 5, "156": 5, "t633": 5, "t670": 5, "226": 5, "45": 5, "nconting": 5, "accrual": 5, "nconcentr": 5, "attribut": [5, 6, 7, 8, 9], "46": 5, "t67": 5, "098": 5, "082": 5, "062": 5, "569": 5, "895": 5, "458": 5, "207": 5, "nonrecur": 5, "t142": 5, "196": 5, "t138": 5, "t147": 5, "859": 5, "nchina": 5, "n66": 5, "t181": 5, "887": 5, "t172": 5, "269": 5, "nlong": 5, "664": 5, "797": 5, "778": 5, "219": 5, "nopinion": 5, "nwe": 5, "fairli": 5, "pcaob": 5, "sponsor": 5, "treadwai": 5, "2013": 5, "unqualifi": [5, 6], "thereon": 5, "nthese": 5, "misstat": 5, "fraud": [5, 8], "ndescript": 5, "naudit": 5, "nhow": 5, "nmatter": 5, "qualifi": 5, "letter": [5, 6], "advisor": 5, "ernst": 5, "llp": 5, "auditor": [5, 6], "2009": 5, "nsan": 5, "jose": 5, "nnovemb": 5, "coso": 5, "nour": 5, "ndefinit": 5, "disposit": 5, "receipt": 5, "nevalu": 5, "nbase": 5, "supervis": [5, 7, 8, 9], "13a": 5, "15d": 5, "ninher": 5, "paragraph": 5, "51": [5, 8, 9], "ninsid": 5, "deirdr": 5, "brien": 5, "vice": 5, "presid": 5, "affirm": 5, "april": 5, "withhold": 5, "remitt": 5, "mr": 5, "copi": [5, 6], "solicit": 5, "00042": 5, "nincorpor": 5, "texhibit": 5, "descript": [5, 6, 7, 8, 9], "tform": 5, "tfile": 5, "nrestat": 5, "namend": 5, "bylaw": 5, "nindentur": 5, "york": [5, 6, 7, 9], "mellon": 5, "truste": 5, "noffic": 5, "certif": 5, "2018": 5, "85": [5, 7, 8], "05": [5, 6], "2044": 5, "februari": 5, "2045": 5, "900": 5, "700": [5, 7], "250": [5, 8], "2036": 5, "2046": 5, "450": 5, "2047": 5, "2049": 5, "2030": 5, "2050": 5, "2060": 5, "2028": 5, "2041": 5, "2061": 5, "2032": 5, "2052": 5, "54": [5, 6], "2033": 5, "2053": 5, "n12": 5, "nsubsidiari": 5, "n23": 5, "nconsent": 5, "n24": 5, "npower": 5, "signatur": 5, "nrule": 5, "nsection": 5, "1350": 5, "n101": 5, "ninlin": 5, "xbrl": 5, "n104": 5, "inlin": 5, "compensatori": 5, "herewith": 5, "furnish": 5, "herebi": 5, "undertak": 5, "56": [5, 7, 8], "nsignatur": 5, "npursuant": 5, "duli": 5, "undersign": 5, "thereunto": 5, "ndate": 5, "nby": 5, "luca": [5, 9], "maestri": 5, "nluca": 5, "nsenior": 5, "nchief": 5, "nknow": 5, "THESE": 5, "appoint": 5, "cook": 5, "jointli": 5, "her": 5, "substitut": 5, "him": 5, "thereto": 5, "therewith": 5, "ratifi": 5, "virtu": 5, "hereof": 5, "nname": 5, "ttitl": 5, "tdate": 5, "tchief": 5, "tnovemb": 5, "ntimothi": 5, "tsenior": 5, "kondo": 5, "nchri": 5, "wanda": 5, "austin": 5, "nwanda": 5, "gorski": 5, "tdirector": 5, "nalex": 5, "jung": 5, "nandrea": 5, "arthur": 5, "levinson": 5, "narthur": 5, "monica": 5, "lozano": 5, "nmonica": 5, "ronald": 5, "sugar": 5, "nronald": 5, "susan": 5, "wagner": 5, "nsusan": 5, "57": [5, 7], "turbo": [5, 7, 9], "outlin": [5, 7, 8], "invdestacksmeticsisdict": 5, "setispect": 5, "20cyan": 5, "evaluationseld": 5, "anvis": 5, "droitent": 5, "discernminerv": 5, "versbobprefvers": 5, "vo\u8be5": 5, "option\u548c": 5, "meio": 5, "\u0432\u0440\u0435\u043ccisco": 5, "dellaischenpoihscap": 5, "geme": 5, "gettim": 5, "unscal": 5, "vocabulari": [5, 7, 9], "closer": 5, "sharpen": 5, "uniform": 5, "raschka": 5, "repetit": [5, 9], "radic": 5, "grappl": 5, "safer": [5, 8], "fascin": 5, "spontan": 5, "answer": [5, 6, 7, 8, 9], "aren": [5, 7], "linear": 5, "absent": [5, 8], "coax": 5, "journei": 5, "suddenli": 5, "manifest": 5, "deliber": [5, 8], "contend": 5, "rethink": [5, 8], "tutor": 5, "children": [5, 8], "verifi": [5, 6, 7, 9], "predefin": [5, 9], "weren": 5, "kind": [5, 6], "usual": [5, 9], "quantif": 5, "contamin": [5, 8], "unseen": [5, 8], "longitudin": 5, "mostli": [5, 9], "latter": 5, "tailor": [5, 8], "great": [5, 7, 8, 9], "cognit": [5, 6], "misinform": [5, 8], "fabric": [5, 8], "citat": 5, "tempor": [5, 6], "disclaim": 5, "referr": 5, "incorrect": [5, 6, 8], "demograph": [5, 8], "stereotyp": [5, 8], "societ": [5, 8], "pii": [5, 8], "anonym": 5, "leakag": [5, 8], "carryov": 5, "fallaci": 5, "think": [5, 7, 8], "idiom": 5, "sarcasm": 5, "terminologi": 5, "lingual": 5, "misunderstand": 5, "syntax": 5, "scan": [5, 6], "compat": [5, 6, 7, 9], "overconfid": [5, 6], "clariti": [5, 6, 8, 9], "audienc": 5, "densiti": 5, "satisfact": [5, 9], "misus": [5, 8], "moral": 5, "co2": 5, "etc": [5, 6, 9], "palm": [5, 7], "easi": [5, 6, 7, 8], "synthet": [5, 7, 8, 9], "templat": [5, 6, 9], "timeout": 5, "inter": 5, "rater": 5, "ti": 5, "holist": [5, 8], "built": [5, 7, 8, 9], "experiment": [5, 6, 7, 9], "vi": 5, "categor": [5, 7, 8, 9], "intrins": [5, 7], "extrins": 5, "perplex": [5, 7], "downstream": [5, 9], "synthesi": 5, "discret": [5, 6], "prefix": [5, 8], "roug": 5, "bleu": 5, "bilingu": 5, "understudi": 5, "overlap": [5, 6], "favor": [5, 7, 9], "breviti": 5, "insensit": 5, "semant": [5, 6, 9], "orient": [5, 8], "gist": 5, "meteor": 5, "synonym": 5, "paraphras": 5, "alongsid": [5, 8], "computation": [5, 6], "cider": 5, "consensu": 5, "tf": 5, "idf": 5, "caption": 5, "reliant": [5, 6], "corpu": [5, 6, 7], "ter": 5, "edit": [5, 8], "hypothesi": 5, "penal": 5, "bertscor": 5, "contextu": [5, 8], "bert": 5, "spice": 5, "proposit": [5, 7], "scene": [5, 6, 8], "analyst": [5, 6], "rouge_1": 5, "rouge_2": 5, "ideal": [5, 6, 7, 8, 9], "setup": [5, 7, 8, 9], "evaluate_summari": 5, "unigram": 5, "bigram": 5, "absl": 5, "py": [5, 9], "rouge_scor": 5, "generated_summari": 5, "reference_summari": 5, "google_bleu": 5, "bleu_scor": 5, "rouge1": 5, "rouge2": 5, "arbitrari": 5, "chosen": [5, 8], "sentence1": 5, "cat": [5, 8], "sat": 5, "mat": 5, "sentence2": 5, "ate": 5, "3333333333333333": 5, "7272727272727272": 5, "4444444444444445": 5, "generate_summari": 5, "summir": 5, "liner": 5, "evaluate_summary_model": 5, "model_benchmark": 5, "models_test": 5, "benchmark_summari": 5, "model_summari": 5, "evaluation_result": 5, "statu": 5, "concis": [5, 7], "element": [5, 6, 8, 9], "verbos": [5, 6, 7, 8, 9], "peripher": 5, "quit": [5, 6, 7, 9], "convei": 5, "breadth": 5, "Of": [5, 7, 8], "vibe": 5, "visualize_prompt_comparison": 5, "matplotlib": 5, "radar": 5, "plot": 5, "radar_plot": 5, "tmp": 5, "ipykernel_1652501": 5, "940173201": 5, "userwarn": [5, 9], "figurecanvasagg": 5, "largest": [5, 7], "sarmah": 5, "granular": [5, 6, 7], "likert": 5, "ensembl": 5, "repeatedli": [5, 6], "fluenci": 5, "refin": 5, "integ": [5, 9], "rubric": 5, "hollist": 5, "judgeevalu": 5, "grammar": [5, 7, 9], "evaluate_with_llm": 5, "criterion": 5, "judge_model": 5, "candidate_summari": 5, "grammat": 5, "y": [5, 6, 8, 9], "z": 5, "w": [5, 6, 7, 8], "benchmark_model": 5, "test_model": 5, "input_text": [5, 6, 7], "trillion": [5, 7, 9], "evals_list": 5, "1775618912": 5, "slightli": 5, "drift": [5, 8], "lowest": [5, 7], "firstli": 5, "overhead": [5, 7], "egocentr": 5, "tight": 5, "medicin": [5, 6, 8], "glider": 5, "deshpand": 5, "3b": 5, "685": 5, "aplic": 5, "golden": 5, "earlier": [5, 8], "depict": [5, 8, 9], "multilingu": [5, 7, 8], "arena": 5, "randomli": 5, "customiz": [5, 7, 8], "irrelev": [5, 6], "unhelp": [5, 8], "occasion": 5, "rare": 5, "perfectli": 5, "cater": [5, 7], "critiqu": [5, 8], "elo": 5, "exam": 5, "probe": [5, 8], "certifi": 5, "glue": 5, "entail": [5, 7], "superglu": 5, "successor": 5, "grew": 5, "big": [5, 7], "bench": [5, 7], "srivastava": 5, "truthfulqa": [5, 7], "multitask": 5, "hendryck": [5, 8], "multidisciplinari": 5, "stanford": 5, "helm": 5, "multidimension": 5, "surround": [5, 7, 8, 9], "humanev": [5, 7], "lmsy": 5, "brought": 5, "dialogu": [5, 7], "chiang": 5, "gather": 5, "hundr": [5, 7], "alpacaev": 5, "duboi": 5, "mt": 5, "argilla": 5, "mila": 5, "mit": [5, 7], "contributor": [5, 7, 9], "western": 5, "centric": 5, "divid": [5, 6, 8], "subset": [5, 8], "agnost": 5, "dialect": 5, "render": [5, 8], "crowdsourc": 5, "livebench": 5, "white": [5, 8], "resili": [5, 6, 8], "meaningfulli": 5, "satur": 5, "zebralog": 5, "grid": 5, "puzzl": 5, "brailsford": 5, "1999": 5, "lsat": 5, "hous": 5, "clue": 5, "deduct": 5, "programmat": [5, 9], "2x2": 5, "6x6": 5, "shot": [5, 8, 9], "reductio": 5, "ad": [5, 6, 7, 9], "absurdum": 5, "hard": [5, 6], "10b": 5, "counterfactu": 5, "mileston": [5, 7], "came": 5, "arc": 5, "prize": [5, 8], "chollet": 5, "mike": [5, 6, 8], "knoop": 5, "founder": 5, "zapier": 5, "fran\u00e7oi": 5, "creator": [5, 7], "kera": 5, "genuin": 5, "agi": 5, "possess": [5, 6], "elementari": 5, "novelti": 5, "interpol": 5, "synthes": [5, 6], "fly": 5, "brute": [5, 6], "pixel": 5, "color": [5, 6], "unbeaten": 5, "win": [5, 7], "takeawai": 5, "vertic": [5, 8], "finbench": 5, "legalbench": 5, "guha": 5, "berkelei": [5, 8], "bfcl": 5, "patil": 5, "fourrier": 5, "bespok": 5, "sdk": 5, "autoregress": 5, "sub": [5, 7], "liter": 5, "disturb": 5, "zero": [5, 7, 8, 9], "varianc": [5, 8], "yt": 5, "ut": 5, "ol": 5, "heteroscedast": 5, "regress": 5, "wish": 5, "bivari": 5, "evaluationtrack": 5, "pipelineparamet": 5, "cache_dir": 5, "max_sampl": 5, "basemodelconfig": 5, "evaluation_track": 5, "model_config": 5, "parallelismmanag": 5, "envconfig": 5, "is_accelerate_avail": 5, "datetim": [5, 6], "timedelta": [5, 6], "initprocessgroupkwarg": 5, "create_evaluation_pipelin": 5, "float16": 5, "kwargs_handl": 5, "3000": 5, "save_detail": 5, "pipeline_param": 5, "launcher_typ": 5, "env_config": 5, "override_batch_s": 5, "use_chat_templ": 5, "trust_remote_cod": 5, "pipeline_paramet": 5, "schemat": [5, 6], "vllm": [5, 9], "tgi": 5, "num_few_shot": 5, "bar": 5, "bigbench": 5, "winogrand": 5, "hellaswag": 5, "nlp": [5, 6, 7, 8], "save_and_push_result": 5, "show_result": 5, "model_arg": 5, "send": [5, 6, 7, 8, 9], "serverless": 5, "inference_server_address": 5, "inference_server_auth": 5, "model_id": 5, "null": 5, "bash": [5, 7], "command": [5, 6, 7], "model_config_path": 5, "endpoint_model": 5, "llama3": 5, "qwen2": [5, 7, 9], "alibaba": [5, 7, 9], "5b": [5, 7, 9], "hui": [5, 7], "allal": [5, 7], "cluster": [5, 6], "noteworthi": [5, 7], "superior": [5, 6, 8], "grain": [5, 6, 7, 9], "salt": [5, 9], "modular": 5, "offici": 5, "revisit": 5, "langchain": [5, 6], "trace": [5, 6], "langchain_tracing_v2": 5, "langchain_api_kei": 5, "hf_evalu": 5, "langsmith_evalu": 5, "ls_client": 5, "dataset_nam": 5, "create_dataset": 5, "create_exampl": 5, "dataset_id": 5, "calculate_scor": 5, "reference_output": 5, "oai_client": 5, "xp_model_nam": 5, "lastli": 5, "run_evalu": 5, "And": [5, 6, 7, 8], "upload_result": 5, "experiment_prefix": 5, "num_repetit": 5, "386a3620": 5, "9e1cc3cb": 5, "9d6a": 5, "4356": 5, "ab34": 5, "138e0abe8be4": 5, "8741976e": 5, "5268": 5, "4b75": 5, "949f": 5, "99477dde5d64": 5, "selectedsess": 5, "b831dc1e": 5, "90bc": 5, "4ed8": 5, "8080": [5, 7], "fb42444724d6": 5, "4it": 5, "latest": [5, 6, 7, 8, 9], "tobia": [5, 9], "evaluate_modul": 5, "6fc70b7be0088120a372dfdd5d320b39b8bb3630cb8029b193941d9376e86bb0": 5, "tue": 5, "nov": [5, 7], "couldn": 5, "5it": 5, "5053784e": 5, "64445871": 5, "a53c": 5, "44b1": 5, "a422": 5, "4f49b2f9656f": 5, "69": [5, 8], "4b29f3c9": 5, "9ef7e39a": 5, "2add": 5, "410c": 5, "89f8": 5, "9f1a8b198cf1": 5, "61": [5, 8], "insert": [5, 6], "combined_df": 5, "concat": [5, 8], "ignore_index": [5, 8], "execution_tim": 5, "example_id": 5, "333333": 5, "224388": 5, "feb10f92": 5, "3167": 5, "41f3": 5, "bb1c": 5, "d271153a31a8": 5, "5b196b22": 5, "9f4c": 5, "489c": 5, "b020": 5, "7823208b42d6": 5, "348101": 5, "722464": 5, "c310f159": 5, "064a": 5, "4035": 5, "97c3": 5, "a25bbf43abc2": 5, "386076": 5, "704104": 5, "f7f24899": 5, "dd50": 5, "409e": 5, "93cc": 5, "6fb1622b60bf": 5, "443038": 5, "725059": 5, "242856d6": 5, "efb5": 5, "4101": 5, "b1cf": 5, "5805532838ac": 5, "373418": 5, "795302": 5, "ce975169": 5, "a0ab": 5, "40ce": 5, "8e32": 5, "efa28d06079d": 5, "stat": [5, 7], "groupbi": [5, 8], "agg": [5, 8], "sort": 5, "sort_valu": 5, "subplot": 5, "pyplot": 5, "plt": 5, "ax1": 5, "ax2": 5, "figsiz": 5, "2ecc71": 5, "3498db": 5, "e74c3c": 5, "bleu_mean": 5, "bleu_std": 5, "enumer": [5, 6, 8], "errorbar": 5, "yerr": 5, "fmt": 5, "markers": 5, "capsiz": 5, "set_ylabel": 5, "set_titl": 5, "set_xtick": 5, "set_xticklabel": 5, "rotat": 5, "set_ylim": 5, "bottom": [5, 6], "legend": 5, "exec_mean": 5, "exec_std": 5, "tight_layout": 5, "ndetail": 5, "4038": 5, "0453": 5, "7815": 5, "0433": 5, "3768": 5, "0424": 5, "8343": 5, "2208": 5, "3519": 5, "0775": 5, "9122": 5, "1482": 5, "377": 5, "042": 5, "078": 5, "slower": [5, 6, 8], "04": [5, 7], "interestingli": 5, "decoupl": 5, "reload": 5, "facilit": [5, 8], "promptfooconfig": 5, "model_comparison": 5, "pretti": [5, 8], "dump": 5, "default_flow_styl": 5, "sort_kei": 5, "prompt1": 5, "defaulttest": 5, "ye": [5, 6, 7, 8, 9], "1000m": 5, "eval_data": 5, "latency_m": 5, "totallatencym": 5, "token_usag": 5, "tokenusag": 5, "assert_pass": 5, "assertpasscount": 5, "assert_fail": 5, "assertfailcount": 5, "prompt_token": [5, 7], "num_request": 5, "numrequest": 5, "num": 5, "2463": 5, "000035": 5, "3773": 5, "004620": 5, "1669": 5, "000091": 5, "1669m": 5, "highest": [5, 6, 7, 9], "3773m": 5, "00462": 5, "promptfool": 5, "manual": [5, 6, 7, 8], "redefin": 5, "prompt_comparison": 5, "prompt2": 5, "prompt3": 5, "prompt_fil": 5, "prompt_cont": 5, "BE": 5, "again": 5, "prompt_id": 5, "promptid": 5, "gradingresult": 5, "df_raw": 5, "reset_index": [5, 8], "poorli": 5, "eas": [5, 7, 8, 9], "hf": [5, 7], "plain": [5, 6, 7], "vanilla": 5, "defi": 5, "accustom": 5, "legaci": 5, "unsustain": 5, "prd": 5, "cultiv": [5, 8], "organiz": 5, "alb": [5, 7], "loubna": [5, 7], "anton": [5, 7], "lozhkov": [5, 7], "bakouch": [5, 7], "gabriel": [5, 7, 8], "mart\u00edn": [5, 7, 8], "bl\u00e1zquez": [5, 7], "lewi": [5, 6, 7], "tunstal": [5, 7], "agust\u00edn": [5, 7], "piquer": [5, 7], "andr": [5, 6, 7], "marafioti": [5, 7], "cyril": [5, 7], "zakka": [5, 7], "leandro": [5, 7], "werra": [5, 7], "wolf": [5, 7], "are24": 5, "judgearena": 5, "bps99": 5, "salli": 5, "pott": 5, "barbara": 5, "557": [5, 8], "sciencedirect": 5, "s0377221798003646": 5, "doi": [5, 6, 8, 9], "1016": 5, "s0377": 5, "2217": 5, "00364": 5, "ctj": 5, "jerri": [5, 8], "tworek": [5, 8], "heewoo": [5, 8], "jun": [5, 8], "qime": [5, 8], "henriqu": [5, 8], "pond": [5, 8], "de": [5, 8], "oliveira": [5, 8], "pinto": [5, 8], "harri": [5, 8], "yuri": 5, "burda": 5, "greg": [5, 8], "brockman": [5, 8], "raul": [5, 8], "puri": [5, 8], "gretchen": [5, 8], "krueger": [5, 8], "petrov": [5, 8], "heidi": 5, "khlaaf": 5, "girish": [5, 8], "sastri": [5, 8], "brook": [5, 8], "chan": [5, 8], "grai": [5, 8], "ryder": [5, 8], "mikhail": [5, 8], "pavlov": [5, 8], "alethea": [5, 8], "lukasz": 5, "kaiser": [5, 8], "mohammad": [5, 8], "bavarian": [5, 8], "clemen": [5, 8], "winter": [5, 8], "philipp": 5, "tillet": [5, 8], "felip": [5, 8], "petroski": [5, 8], "dave": [5, 8], "cum": [5, 8], "plappert": 5, "fotio": 5, "chantzi": [5, 8], "barn": 5, "ariel": 5, "herbert": 5, "voss": [5, 8], "hebgen": 5, "guss": 5, "nichol": 5, "paino": [5, 8], "nikola": [5, 8], "tezak": [5, 8], "babuschkin": [5, 8], "suchir": [5, 8], "balaji": [5, 8], "shantanu": [5, 8], "jain": [5, 8], "hess": [5, 8], "carr": 5, "josh": [5, 8], "achiam": [5, 8], "vedant": 5, "misra": 5, "evan": [5, 7, 8], "morikawa": [5, 8], "matthew": 5, "knight": [5, 8], "mile": [5, 8], "brundag": [5, 8], "mira": [5, 8], "murati": [5, 8], "kati": [5, 8], "mayer": [5, 8], "bob": [5, 8, 9], "mcgrew": [5, 8], "ilya": [5, 8], "sutskev": [5, 8], "wojciech": [5, 8], "zaremba": [5, 8], "2107": 5, "03374": 5, "cz": 5, "lianmin": 5, "ying": 5, "sheng": 5, "anastasio": 5, "angelopoulo": 5, "tianl": 5, "dacheng": 5, "banghua": 5, "jordan": [5, 8], "gonzalez": 5, "ion": 5, "stoica": 5, "04132": 5, "cho24a": 5, "francoi": 5, "arcpriz": 5, "cho24b": 5, "drcw": 5, "darshan": 5, "selvan": 5, "sunitha": 5, "ravi": 5, "sky": 5, "ch": 5, "bartosz": 5, "mielczarek": 5, "anand": [5, 8], "kannappan": [5, 8], "qian": [5, 8], "14140": 5, "dglh24": 5, "yann": 5, "bal\u00e1z": 5, "galambosi": 5, "tatsunori": 5, "hashimoto": 5, "debia": 5, "04475": 5, "fhwt23": 5, "cl\u00e9mentin": 5, "nathan": 5, "habib": 5, "gnh": 5, "julian": 5, "nyarko": 5, "ho": 5, "r\u00e9": 5, "adam": [5, 8], "chilton": 5, "aditya": [5, 8], "narayana": 5, "chohla": 5, "brandon": [5, 8, 9], "waldon": 5, "rockmor": 5, "diego": 5, "zambrano": 5, "dmitri": 5, "talisman": 5, "enam": 5, "hoqu": 5, "faiz": 5, "surani": 5, "frank": [5, 8], "fagan": 5, "galit": 5, "sarfati": 5, "gregori": 5, "dickinson": 5, "haggai": 5, "porat": 5, "hegland": 5, "jessica": [5, 8], "joe": [5, 8], "nudel": 5, "joel": [5, 8], "niklau": 5, "nai": 5, "choi": 5, "margaret": [5, 7], "hagan": 5, "megan": 5, "livermor": 5, "nikon": 5, "rasumov": 5, "rahe": 5, "nil": 5, "holzenberg": 5, "noam": 5, "kolt": 5, "henderson": 5, "rehaag": 5, "sharad": 5, "shang": 5, "spencer": 5, "sunni": 5, "gandhi": 5, "zur": 5, "varun": 5, "iyer": [5, 8], "zehua": 5, "2308": 5, "11462": 5, "hbb": 5, "collin": 5, "burn": 5, "steven": [5, 8], "basart": [5, 8], "zou": [5, 8], "manta": [5, 8], "mazeika": [5, 8], "03300": 5, "hbd": 5, "maxwel": 5, "forb": 5, "yejin": 5, "curiou": 5, "neural": [5, 9], "degener": 5, "1904": 5, "09751": 5, "hug24a": 5, "wiki": [5, 9], "hug24b": 5, "hug24c": 5, "model_doc": 5, "hug24d": 5, "cookbook": [5, 6], "llm_judg": 5, "hug24f": 5, "hyc": [5, 7], "binyuan": [5, 7], "zeyu": [5, 7], "cui": [5, 7], "jiaxi": [5, 7], "dayiheng": [5, 7], "tianyu": [5, 7], "jiajun": [5, 7], "kai": [5, 6, 7, 8], "dang": [5, 7], "coder": [5, 7], "preprint": [5, 7, 9], "2409": [5, 7, 8], "12186": [5, 7], "lx": 5, "zhen": 5, "xiaohan": 5, "jia": [5, 6], "yuxuan": 5, "lai": 5, "chongyang": 5, "shuai": 5, "nlg": 5, "07103": 5, "lbl": 5, "bommasani": 5, "toni": 5, "dimitri": 5, "tsipra": 5, "dilara": 5, "soylu": 5, "michihiro": 5, "yasunaga": 5, "yian": 5, "deepak": 5, "narayanan": 5, "yuhuai": 5, "newman": 5, "binhang": 5, "bobbi": 5, "ce": 5, "christian": [5, 8], "cosgrov": 5, "acosta": 5, "nava": [5, 8], "drew": 5, "hudson": 5, "zelikman": 5, "esin": 5, "durmu": 5, "faisal": 5, "ladhak": 5, "frieda": 5, "rong": [5, 6], "ren": [5, 7], "huaxiu": 5, "yao": [5, 8, 9], "jue": 5, "keshav": 5, "santhanam": 5, "laurel": 5, "lucia": 5, "mert": 5, "yuksekgonul": 5, "mirac": 5, "suzgun": 5, "niladri": 5, "chatterji": 5, "omar": [5, 6], "khattab": [5, 6], "chi": [5, 6, 8, 9], "sang": [5, 8], "shibani": [5, 8], "santurkar": [5, 8], "surya": 5, "icard": 5, "tianyi": 5, "vishrav": 5, "chaudhari": 5, "xuechen": 5, "yuhui": 5, "yuta": 5, "koreeda": 5, "2211": 5, "09110": 5, "lbc24": 5, "ronan": 5, "bra": 5, "allenai": 5, "lhe22": [5, 7, 8], "stephani": [5, 7, 8], "owain": [5, 7, 8], "mimic": [5, 7, 8], "falsehood": [5, 7, 8], "2109": [5, 7, 8], "07958": [5, 7, 8], "pzwg23": 5, "shishir": 5, "tianjun": 5, "xin": [5, 8], "gorilla": 5, "15334": 5, "pro24": 5, "dev": [5, 6], "ras24": 5, "sebastian": [5, 6], "scratch": 5, "1633437166": 5, "sll": 5, "bhaskarjit": 5, "mingshu": 5, "jingrao": 5, "lyu": 5, "nathalia": 5, "castellano": 5, "pasquali": 5, "dhagash": 5, "12148": 5, "srf": 5, "shivalika": 5, "angelika": 5, "roman": [5, 8], "adelani": 5, "ngui": 5, "vila": 5, "suero": 5, "peerat": 5, "limkonchotiwat": 5, "kelli": 5, "marchisio": 5, "qi": [5, 6], "leong": 5, "yosephin": 5, "susanto": 5, "raymond": [5, 8], "ng": [5, 8], "shayn": 5, "longpr": 5, "ko": 5, "madelin": 5, "antoin": 5, "bosselut": 5, "oh": 5, "leshem": 5, "choshen": 5, "daphn": 5, "ippolito": 5, "enzo": [5, 9], "ferrant": 5, "marzieh": 5, "fadae": 5, "beyza": 5, "ermi": 5, "sara": 5, "hooker": 5, "linguist": [5, 6, 8], "03304": 5, "srr": 5, "aarohi": 5, "abhinav": [5, 6], "rastogi": 5, "abhishek": 5, "rao": 5, "abu": 5, "awal": 5, "shoeb": 5, "abubakar": 5, "abid": [5, 7], "fisch": 5, "santoro": 5, "gupta": 5, "adri\u00e0": 5, "garriga": 5, "alonso": 5, "agnieszka": 5, "kluska": 5, "aitor": 5, "lewkowycz": 5, "akshat": 5, "warstadt": 5, "alexand": [5, 8, 9], "kocurek": 5, "ali": [5, 8], "safaya": 5, "tazarv": 5, "aman": 5, "hussain": 5, "dsouza": 5, "ambros": 5, "slone": 5, "ameet": 5, "rahan": 5, "anantharaman": 5, "ander": 5, "andreassen": 5, "madotto": 5, "santilli": 5, "stuhlm\u00fcller": 5, "la": 5, "lampinen": 5, "angelica": 5, "anh": 5, "vuong": 5, "animesh": 5, "gottardi": 5, "antonio": 5, "norelli": 5, "anu": 5, "venkatesh": 5, "arash": 5, "gholamidavoodi": 5, "arfa": 5, "tabassum": 5, "arul": 5, "menez": 5, "arun": [5, 8], "kirubarajan": 5, "asher": 5, "mullokandov": 5, "ashish": 5, "sabharw": 5, "herrick": 5, "avia": 5, "efrat": 5, "aykut": 5, "erdem": 5, "ayla": 5, "karaka\u015f": 5, "bao": [5, 7, 8], "loe": 5, "barret": [5, 8], "zoph": [5, 8], "bart\u0142omiej": 5, "bojanowski": 5, "batuhan": 5, "\u00f6zyurt": 5, "behnam": 5, "hedayatnia": 5, "neyshabur": 5, "inden": 5, "benno": 5, "stein": 5, "berk": 5, "ekmekci": 5, "blake": 5, "howald": 5, "bryan": 5, "orinion": 5, "diao": 5, "dour": 5, "stinson": 5, "cedrick": 5, "argueta": 5, "c\u00e9sar": 5, "ferri": 5, "ram\u00edrez": 5, "chandan": 5, "charl": 5, "rathkopf": 5, "chenlin": 5, "meng": 5, "chitta": 5, "baral": 5, "chiyu": 5, "callison": 5, "burch": 5, "voigt": 5, "cindi": 5, "ramirez": 5, "clara": 5, "rivera": 5, "clemencia": 5, "siro": 5, "colin": [5, 7], "raffel": [5, 7], "courtnei": 5, "ashcraft": 5, "cristina": 5, "garbacea": 5, "damien": [5, 8], "sileo": 5, "garrett": 5, "kilman": 5, "freeman": 5, "khashabi": 5, "levi": [5, 8], "mosegu\u00ed": 5, "gonz\u00e1lez": 5, "perszyk": 5, "danqi": 5, "dar": 5, "gilboa": 5, "dohan": [5, 8], "drakard": 5, "jurgen": 5, "debajyoti": 5, "datta": 5, "deni": 5, "emelin": 5, "kleyko": 5, "deniz": 5, "yuret": 5, "derek": [5, 8], "tam": [5, 9], "dieuwk": 5, "hupk": 5, "diganta": 5, "dilyar": 5, "buzan": 5, "coelho": 5, "mollo": 5, "diyi": 5, "dylan": 5, "schrader": 5, "ekaterina": 5, "shutova": 5, "ekin": 5, "dogu": 5, "cubuk": 5, "elad": 5, "segal": 5, "eleanor": 5, "hagerman": 5, "donowai": 5, "elli": 5, "pavlick": 5, "rodola": 5, "emma": 5, "lam": 5, "chu": [5, 8], "erkut": 5, "erni": 5, "dyer": 5, "jerzak": 5, "eunic": 5, "engefu": 5, "manyasi": 5, "evgenii": 5, "zheltonozhskii": 5, "fanyu": 5, "fatemeh": 5, "siar": 5, "fernando": 5, "mart\u00ednez": 5, "plume": 5, "francesca": 5, "happ\u00e9": 5, "gaurav": 5, "genta": 5, "indra": 5, "winata": 5, "gerard": 5, "melo": 5, "germ\u00e1n": 5, "kruszewski": 5, "giambattista": [5, 8], "parascandolo": [5, 8], "giorgio": 5, "mariani": 5, "gloria": 5, "gonzalo": 5, "jaimovitch": 5, "l\u00f3pez": 5, "gregor": 5, "betz": 5, "gui": [5, 7], "gur": 5, "hana": 5, "galijasev": 5, "rashkin": 5, "hannaneh": 5, "hajishirzi": 5, "harsh": 5, "hayden": 5, "bogar": 5, "henri": [5, 8], "shevlin": 5, "hinrich": 5, "sch\u00fctze": 5, "hiromu": 5, "yakura": 5, "hongm": 5, "hugh": 5, "mee": 5, "wong": [5, 6, 8], "isaac": 5, "nobl": 5, "jaap": 5, "jumelet": 5, "geissing": 5, "jaehoon": 5, "jaim": 5, "fern\u00e1ndez": 5, "fisac": 5, "simon": 5, "koppel": 5, "koco\u0144": 5, "jana": 5, "thompson": [5, 7, 8], "janel": 5, "wingfield": 5, "jarema": 5, "radom": 5, "jascha": 5, "sohl": [5, 8], "dickstein": 5, "phang": 5, "yosinski": 5, "jekaterina": 5, "novikova": 5, "jell": 5, "bosscher": 5, "jennif": 5, "marsh": 5, "jeroen": 5, "taal": 5, "engel": 5, "jesujoba": 5, "alabi": 5, "jiam": 5, "jillian": 5, "joan": 5, "waweru": 5, "burden": 5, "bali": 5, "batcheld": 5, "berant": 5, "j\u00f6rg": 5, "frohberg": 5, "jo": 5, "rozen": 5, "orallo": 5, "boudeman": 5, "guerr": 5, "tenenbaum": 5, "joyc": 5, "chua": 5, "kanclerz": 5, "karen": 5, "livescu": 5, "karl": 5, "krauth": 5, "karthik": 5, "gopalakrishnan": 5, "katerina": 5, "ignatyeva": 5, "katja": 5, "markert": 5, "kaustubh": 5, "dhole": 5, "gimpel": 5, "omondi": 5, "kori": 5, "mathewson": 5, "kristen": 5, "chiafullo": 5, "ksenia": 5, "shkaruta": 5, "shridhar": 5, "kyle": [5, 6, 8], "mcdonel": 5, "richardson": 5, "laria": 5, "reynold": 5, "leo": [5, 8], "dugan": 5, "lianhui": 5, "lidia": 5, "contrera": 5, "ochando": 5, "morenc": 5, "moschella": 5, "luci": 5, "ludwig": 5, "schmidt": [5, 8], "luheng": 5, "olivero": 5, "col\u00f3n": 5, "metz": [5, 8], "l\u00fctfi": 5, "kerem": 5, "\u015fenel": 5, "bosma": [5, 6], "sap": [5, 8], "maartj": 5, "hoev": 5, "maheen": 5, "farooqi": 5, "manaal": 5, "faruqui": 5, "marco": [5, 6], "baturan": 5, "marelli": 5, "maru": 5, "maria": 5, "quintana": 5, "tolkiehn": 5, "mario": [5, 8], "giulianelli": 5, "martha": 5, "potthast": 5, "leavitt": 5, "hagen": 5, "m\u00e1ty\u00e1": 5, "schubert": 5, "medina": [5, 8], "orduna": 5, "baitemirova": 5, "melodi": 5, "arnaud": 5, "melvin": 5, "mcelrath": 5, "yee": 5, "cohen": 5, "ivanitskii": 5, "starritt": 5, "strube": 5, "micha\u0142": 5, "sw\u0119drowski": 5, "michel": [5, 8], "bevilacqua": 5, "mihir": 5, "kale": 5, "cain": 5, "mime": 5, "mitch": 5, "walker": 5, "mo": 5, "tiwari": 5, "mohit": 5, "bansal": 5, "moin": 5, "aminnaseri": 5, "mor": 5, "geva": 5, "mozhdeh": 5, "gheini": 5, "mukund": [5, 6], "varma": 5, "nanyun": 5, "peng": [5, 8], "nayeon": 5, "neta": 5, "krakov": 5, "doiron": 5, "nicol": 5, "martinez": 5, "nikita": [5, 6], "nangia": 5, "nikla": 5, "decker": 5, "muennighoff": 5, "nitish": [5, 8], "shirish": [5, 8], "keskar": [5, 8], "niveditha": 5, "constant": 5, "fiedel": 5, "nuan": 5, "wen": [5, 6], "oliv": [5, 8], "agha": 5, "elbaghdadi": 5, "omer": 5, "moreno": 5, "casar": 5, "parth": 5, "doshi": 5, "pascal": [5, 6], "fung": 5, "pu": 5, "vicol": 5, "pegah": 5, "alipoormolabashi": 5, "peiyuan": 5, "eckerslei": 5, "phu": 5, "mon": 5, "htut": 5, "pinyu": 5, "hwang": 5, "piotr": 5, "mi\u0142kowski": 5, "piyush": 5, "pouya": [5, 6], "pezeshkpour": [5, 6], "priti": 5, "oli": 5, "qiaozhu": [5, 6], "mei": [5, 6, 7], "qing": [5, 8], "qinlang": 5, "rabin": 5, "banjad": 5, "rachel": [5, 8], "etta": 5, "rudolph": 5, "raefer": 5, "rahel": 5, "haback": 5, "ramon": 5, "risco": 5, "rapha\u00ebl": 5, "milli\u00e8r": 5, "rhythm": 5, "garg": [5, 7], "rif": 5, "saurou": 5, "riku": 5, "arakawa": 5, "robb": 5, "raymaek": 5, "rohan": 5, "sikand": 5, "novak": 5, "sitelew": 5, "lebra": 5, "rosann": 5, "rowan": [5, 8], "ruslan": 5, "salakhutdinov": 5, "stoval": 5, "teehan": 5, "sahib": 5, "saif": 5, "sajant": 5, "dillav": 5, "shleifer": 5, "wiseman": 5, "gruetter": 5, "schoenholz": 5, "sanghyun": 5, "sanjeev": 5, "kwatra": 5, "sarik": 5, "ghazarian": 5, "sayan": 5, "casei": [5, 8], "bischoff": 5, "gehrmann": 5, "schuster": 5, "sepideh": 5, "sadeghi": 5, "shadi": 5, "hamdan": 5, "sharon": 5, "shashank": 5, "sherri": 5, "shi": [5, 8], "shikhar": 5, "shima": 5, "asaadi": 5, "shubh": 5, "pachchigar": 5, "shubham": 5, "toshniw": 5, "shyam": [5, 8], "upadhyai": 5, "shyamolima": 5, "debnath": 5, "siamak": 5, "shakeri": 5, "thormey": 5, "melzi": 5, "siva": 5, "reddi": 5, "sneha": 5, "priscilla": 5, "makini": 5, "soo": 5, "hwan": 5, "toren": 5, "sriharsha": 5, "hatwar": 5, "stanisla": 5, "dehaen": 5, "stefan": 5, "divic": 5, "stella": 5, "biderman": 5, "stephen": 5, "prasad": 5, "piantadosi": 5, "stuart": [5, 8], "shieber": 5, "summer": [5, 8], "misherghi": 5, "svetlana": 5, "kiritchenko": 5, "swaroop": 5, "tal": 5, "linzen": 5, "tariq": 5, "tatsu": 5, "te": 5, "th\u00e9o": 5, "desbord": 5, "theodor": 5, "rothschild": 5, "phan": [5, 8], "tiberiu": 5, "nkinyili": 5, "timo": 5, "schick": 5, "timofei": 5, "kornev": 5, "titu": 5, "tunduni": 5, "gerstenberg": 5, "trenton": 5, "trishala": 5, "neeraj": 5, "tushar": 5, "khot": 5, "shultz": 5, "uri": 5, "shaham": 5, "vera": 5, "demberg": 5, "victoria": [5, 8], "nyamai": 5, "vika": 5, "raunak": 5, "vinai": 5, "ramasesh": 5, "udai": 5, "prabhu": 5, "vishakh": 5, "padmakumar": 5, "vivek": [5, 6], "srikumar": [5, 6], "fedu": [5, 8], "wout": 5, "vossen": 5, "xiaoyu": 5, "tong": [5, 8], "xinran": 5, "xinyi": 5, "yadollah": 5, "yaghoobzadeh": 5, "yair": 5, "lakretz": 5, "yangqiu": 5, "yasaman": 5, "bahri": 5, "yichi": 5, "yide": 5, "yifu": 5, "yonatan": 5, "belinkov": 5, "yufang": 5, "seid": 5, "zhuoy": 5, "zijian": 5, "ziji": 5, "zirui": 5, "ziyi": 5, "extrapol": [5, 6], "2206": 5, "04615": 5, "wpn": 5, "yada": 5, "pruksachatkun": 5, "amanpreet": 5, "hill": 5, "stickier": 5, "wsm": 5, "1804": 5, "07461": 5, "wtb": 5, "tai": 5, "borgeaud": 5, "dani": 5, "yogatama": 5, "denni": [5, 6, 8], "donald": 5, "metzler": 5, "ed": [5, 6], "oriol": 5, "vinyal": 5, "dean": 5, "07682": 5, "wdr": 5, "doolei": 5, "manlei": 5, "arka": [5, 8], "pal": 5, "feuer": 5, "siddhartha": 5, "ravid": 5, "shwartz": [5, 8], "ziv": 5, "khalid": [5, 7], "saifullah": 5, "siddartha": 5, "naidu": 5, "chinmai": 5, "hegd": 5, "lecun": 5, "goldstein": 5, "willi": 5, "neiswang": 5, "micah": 5, "goldblum": 5, "19314": 5, "yyh": 5, "baosong": [5, 7], "chengpeng": 5, "chengyuan": [5, 7], "fei": [5, 6, 7], "guant": 5, "haoran": [5, 7], "huan": [5, 7], "jialong": 5, "jialin": 5, "jianhong": [5, 7], "tu": [5, 7], "jianwei": [5, 7], "jianxin": [5, 7], "jin": [5, 6, 8], "jingren": [5, 7], "jinz": 5, "jinzheng": 5, "junyang": [5, 7], "keme": [5, 7], "keqin": [5, 7], "kexin": [5, 7], "mingfeng": [5, 7], "xue": [5, 7, 8], "ni": [5, 6], "pei": [5, 7, 8], "ru": 5, "men": [5, 7], "ruiz": 5, "runji": [5, 7], "shiji": 5, "sinan": 5, "tianhang": 5, "wenbin": 5, "ge": 5, "xiaodong": 5, "deng": 5, "xiaohuan": 5, "xingzhang": [5, 7], "xinyu": [5, 8], "xipin": 5, "xuancheng": [5, 7], "yichang": [5, 7], "wan": [5, 7], "yunfei": 5, "yuqiong": [5, 7], "zhenru": [5, 7], "zhihao": 5, "10671": 5, "zcl24": 5, "zhihan": 5, "cao": 5, "lizi": 5, "openreview": [5, 6], "forum": [5, 6], "aegrf1uy0p": 5, "zc": 5, "siyuan": 5, "zhuang": [5, 8], "zhanghao": 5, "yonghao": 5, "zi": 5, "zhuohan": 5, "xing": [5, 8], "2306": [5, 8], "05685": 5, "huggingface24": 5, "metaai24": 5, "422": 5, "thank": [5, 7, 9], "doubl": 6, "steve": [6, 8], "lc": 6, "cutoff": 6, "amayuela": 6, "tail": 6, "kotha": 6, "unifi": [6, 7, 9], "realli": 6, "silver": 6, "bullet": 6, "mandatori": 6, "ingest": 6, "preprocess": [6, 7, 9], "parser": [6, 9], "microsoft": [6, 7], "autogen": 6, "powerpoint": 6, "ocr": 6, "exif": 6, "metadata": [6, 7], "docker": [6, 7], "container": [6, 7], "xlsx": 6, "text_cont": 6, "ibm": [6, 7, 8], "docx": 6, "pptx": 6, "layout": 6, "llamaindex": 6, "document_convert": 6, "documentconvert": 6, "export_to_markdown": 6, "presenc": 6, "merril": 6, "lynch": 6, "cio": 6, "outlook": 6, "forecast_file_path": 6, "result_md": 6, "forecast_result_docl": 6, "levenshtein": 6, "distanc": 6, "sequencematch": 6, "difflib": 6, "longest": 6, "levenshtein_similar": 6, "text1": 6, "text2": 6, "max_len": 6, "simple_similar": 6, "ratio": [6, 7], "forecast_result_md": 6, "13985705461925346": 6, "17779960707269155": 6, "readabl": 6, "messi": 6, "2025e": 6, "compos": [6, 7, 8], "financial_vari": 6, "financial_forecast": 6, "econforecast": 6, "extract_prompt": 6, "base_prompt": [6, 9], "extract_from_doc": 6, "twice": 6, "md_financi": 6, "docling_financi": 6, "easier": [6, 7, 8, 9], "gdp": 6, "cpi": 6, "fed": 6, "df_md_forecast": 6, "df_docling_forecast": 6, "despit": [6, 7, 9], "underweight": 6, "neutral": [6, 8], "overweight": 6, "chart": 6, "asset_class_docl": 6, "asset_class_md": 6, "df_md": 6, "df_docl": 6, "true_valu": 6, "df_comparison": 6, "cap": 6, "exempt": 6, "markitdown_accuraci": 6, "docling_accuraci": 6, "93": [6, 7, 8], "unstructur": [6, 7, 9], "sector": 6, "convert_and_export_t": 6, "file_path": 6, "doc_convert": 6, "start_tim": [6, 8], "conv_r": 6, "table_df": 6, "export_to_datafram": 6, "end_tim": 6, "2f": 6, "usd": 6, "wtd": 6, "mtd": 6, "ytd": 6, "djia": 6, "926": 6, "amp": 6, "051": 6, "277": 6, "russel": [6, 8], "2000": 6, "msci": 6, "817": [6, 8], "eaf": 6, "319": 6, "107": 6, "01": [6, 7], "66": [6, 8], "92": 6, "municip": 6, "79": [6, 8], "slight": 6, "discretionari": 6, "yellow": 6, "estat": 6, "orang": 6, "stapl": 6, "constructor": 6, "md_llm": 6, "llm_client": 6, "llm_model": 6, "png": 6, "overview": [6, 9], "showcas": 6, "bond": 6, "crude": 6, "oil": 6, "sit": 6, "648": 6, "ounc": 6, "euro": 6, "tactic": 6, "bofa": 6, "circl": [6, 8], "firecrawl": 6, "mendabl": 6, "crawler": 6, "llamapars": 6, "deserv": 6, "arulkumaran": 6, "karthikeyan": 6, "almasri": 6, "fetch": 6, "spreadsheet": 6, "literatur": [6, 8], "canon": 6, "succinct": [6, 7], "authorship": 6, "book_url": 6, "intro": 6, "structured_output": 6, "chapter_url": 6, "chapter_id": 6, "dimension": 6, "chromadb": 6, "weaviat": 6, "faiss": 6, "milvu": 6, "chroma_cli": 6, "aw": [6, 7, 8], "azur": 6, "gcp": 6, "create_collect": 6, "taming_llm": 6, "argument": [6, 7, 8, 9], "query_collect": 6, "query_text": 6, "n_result": 6, "enquir": 6, "related": 6, "leaderboard": [6, 7, 8], "2024i": 6, "behind": [6, 8], "minilm": 6, "l6": 6, "v2": [6, 7, 8], "sentence_transform": 6, "2024f": 6, "sentencetransform": 6, "embedding_model": 6, "docs_to_emb": 6, "encod": [6, 7, 8, 9], "384": [6, 8], "0000": 6, "4402": 6, "3022": 6, "4028": 6, "6606": 6, "5807": 6, "6313": 6, "matrix": [6, 7, 8], "heatmap": 6, "wise": [6, 9], "dataset": [6, 9], "tree": [6, 8, 9], "kd": 6, "ball": 6, "partit": 6, "hierarch": [6, 8], "curs": 6, "hnsw": 6, "promin": [6, 8], "lsh": 6, "hash": 6, "bucket": 6, "sacrific": [6, 7], "chroma": 6, "tutori": 6, "crossencod": 6, "512": 6, "passag": [6, 8], "argmax": 6, "52623": 6, "328738": 6, "750055": 6, "ideia": 6, "rake": 6, "topk": [6, 9], "rag_system_prompt_templ": 6, "user_prompt_templ": 6, "popul": 6, "rag_qa": 6, "res_rerank": 6, "invok": [6, 9], "alammar": 6, "diamant": 6, "kimothi": 6, "athinaai": 6, "envis": 6, "incomplet": [6, 7, 8], "unreli": [6, 7], "acut": 6, "unverifi": 6, "intric": 6, "hamper": 6, "raga": 6, "misinterpret": 6, "appar": [6, 8], "shed": 6, "light": 6, "misl": 6, "gemini": [6, 7], "outperform": [6, 7], "rout": 6, "hybrid": 6, "retrollm": 6, "loft": 6, "hop": 6, "gecko": 6, "vectordb": 6, "llama_pars": 6, "llx": 6, "result_typ": 6, "load_data": 6, "doc1": 6, "doc2": 6, "llama_index": 6, "vectorstoreindex": 6, "simpledirectoryread": 6, "vector_stor": 6, "chromavectorstor": 6, "storagecontext": 6, "db": 6, "persistentcli": 6, "chroma_db": 6, "chroma_collect": 6, "get_or_create_collect": 6, "storage_context": 6, "from_default": 6, "from_docu": 6, "query_engin": 6, "as_query_engin": 6, "prototyp": [6, 7], "complement": 6, "reassembl": 6, "breakdown": [6, 8], "fewer": [6, 7, 8], "furthermor": [6, 9], "zenml": 6, "max_output_token": 6, "statement": [6, 8], "10k": 6, "diagram": [6, 8], "charactertextsplitt": 6, "tiktoken": [6, 8], "sequenti": 6, "newlin": 6, "cheap": 6, "speciali": 6, "nltk": 6, "spaci": 6, "talk": 6, "theme": [6, 7, 8], "splitter": 6, "surpass": 6, "get_chunk": 6, "chunk_siz": 6, "chunk_overlap": 6, "langchain_text_splitt": 6, "text_splitt": 6, "from_tiktoken_encod": 6, "split_text": 6, "persona": 6, "langchain_cor": [6, 9], "prompttempl": 6, "get_base_prompt_templ": 6, "from_templ": 6, "llmchain": 6, "output_pars": 6, "stroutputpars": 6, "langchain_commun": 6, "chat_model": 6, "chatlitellm": 6, "get_llm_chain": 6, "prompt_templ": [6, 9], "llm_chain": [6, 9], "api_key_label": 6, "upper": 6, "_api_kei": 6, "get_dynamic_prompt_templ": 6, "get_dynamic_prompt_param": 6, "prompt_param": 6, "part_idx": 6, "total_part": 6, "chat_context": 6, "param": 6, "dynamic_prompt_param": 6, "concaten": 6, "generate_report": 6, "input_cont": 6, "llm_model_nam": 6, "report_part": 6, "num_part": 6, "dinam": 6, "priovid": 6, "cummul": 6, "max_chunk_s": 6, "max_chunk_overlap": 6, "apple_report": 6, "report_cont": 6, "report_lin": 6, "splitlin": 6, "total_lin": 6, "quarter_lin": 6, "top_port": 6, "bottom_port": 6, "uncov": [6, 8, 9], "delv": 6, "consol": 6, "reaction": 6, "disciplin": 6, "subhead": 6, "depth": [6, 8], "2m": [6, 7], "harvard": [6, 7], "enrol": 6, "gov": [6, 8], "1039": 6, "birth": [6, 8], "democraci": 6, "tuesdai": 6, "magna": 6, "carta": 6, "trudg": 6, "dens": 6, "conversation": 6, "knowledge_bas": 6, "add_knowledge_bas": 6, "add_cit": 6, "bool": [6, 8], "num_quest": 6, "input_memori": 6, "response_memori": 6, "urls_memori": 6, "extractor": 6, "cic": 6, "citabl": 6, "corpora": 6, "formatted_cont": 6, "reference_id": 6, "wrapper": [6, 9], "content_gener": 6, "user_instruct": 6, "llmbackend": 6, "cache_ttl": 6, "cachedcont": 6, "display_nam": 6, "due_knowledge_bas": 6, "system_instruct": 6, "compose_prompt": 6, "conversation_config": 6, "ttl": 6, "generativemodel": 6, "from_cached_cont": 6, "cached_cont": 6, "quiz_inst": 6, "professor": 6, "difficulti": [6, 8], "syllabu": 6, "kennedi": 6, "inaugur": 6, "lincoln": 6, "gettysburg": 6, "liberti": 6, "mayflow": 6, "abraham": 6, "gutenberg": 6, "kb": 6, "epub": 6, "pg": 6, "gemini_duo": 6, "genai_duo": 6, "duo": 6, "usage_metadata": 6, "38470": 6, "anytim": 6, "shap": 6, "mckechni": 6, "study_refer": 6, "pg10000": 6, "65363": 6, "pg65363": 6, "quizz": 6, "problemat": [6, 8], "simpler": [6, 7, 9], "ag24": 6, "jai": [6, 8], "1098150969": 6, "9781098150952": 6, "awp": 6, "alfonso": 6, "liangm": 6, "pan": [6, 8], "wenhu": 6, "lun": 6, "ku": 6, "editor": [6, 8], "acl": [6, 8], "6416": 6, "6432": 6, "bangkok": 6, "thailand": 6, "aclanthologi": [6, 8], "383": 6, "18653": [6, 8], "v1": [6, 7, 8], "bcv14": 6, "aaron": 6, "courvil": 6, "vincent": 6, "1206": 6, "5538": 6, "dia24": 6, "nir": 6, "nirdiam": 6, "rag_techniqu": 6, "hrk": 6, "koleczek": 6, "arshdeep": 6, "franklin": 6, "sadid": 6, "hasan": 6, "10541": 6, "jlz": 6, "mathew": 6, "erik": [6, 8], "lindgren": 6, "matei": 6, "zaharia": 6, "carbin": 6, "drozdov": 6, "drown": 6, "11767": 6, "kim24": 6, "9781633435858": 6, "meap": 6, "ksr24": 6, "suha": 6, "springer": 6, "aditi": 6, "raghunathan": 6, "twelfth": 6, "vrhif2hsrm": 6, "lcd": 6, "jinhyuk": 6, "zhuyun": 6, "dheeru": 6, "dua": 6, "devendra": 6, "sachan": 6, "boratko": 6, "luan": 6, "s\u00e9bastien": 6, "arnold": 6, "perot": 6, "siddharth": 6, "dalmia": 6, "hexiang": 6, "panupong": 6, "pasupat": 6, "aida": 6, "amini": 6, "cole": 6, "riedel": 6, "iftekhar": 6, "naim": 6, "ming": [6, 8], "guu": 6, "subsum": 6, "sql": 6, "13121": 6, "lpp": 6, "aleksandra": 6, "piktu": 6, "fabio": [6, 8], "petroni": 6, "vladimir": 6, "karpukhin": 6, "heinrich": 6, "k\u00fcttler": 6, "tau": 6, "yih": 6, "rockt\u00e4schel": 6, "douw": 6, "kiela": 6, "2005": 6, "11401": 6, "ljz": 6, "xiaoxi": 6, "jiaji": 6, "yongkang": 6, "zhonghua": 6, "zhicheng": 6, "dou": 6, "empow": [6, 8], "11919": 6, "llz": 6, "zhuowan": 6, "cheng": [6, 8, 9], "mingyang": 6, "benderski": 6, "16833": 6, "lfc": 6, "zhihang": 6, "chao": 6, "rongxin": 6, "yaowu": 6, "jiep": 6, "16434": 6, "lla24": 6, "nbgc24": 6, "shiyu": 6, "kepe": 6, "bi": 6, "jiafeng": 6, "guo": [6, 8], "xueqi": 6, "11375": 6, "11388": 6, "675": 6, "tdw": 6, "jiejun": 6, "mang": 6, "weipeng": 6, "ji": 6, "htmlrag": 6, "02959": 6, "ww": 6, "dale": 6, "schuurman": 6, "ichter": 6, "quoc": 6, "2201": [6, 8], "11903": 6, "wip": 6, "yunshu": 6, "hayat": 6, "iso": 6, "bhutani": 6, "estevam": 6, "hruschka": 6, "2309": [6, 8], "07382": 6, "zlj": 6, "yun": [6, 9], "metacognit": 6, "1453": 6, "1463": 6, "ny": [6, 8, 9], "usa": [6, 8, 9], "machineri": [6, 9], "1145": [6, 8, 9], "3589334": 6, "3645481": 6, "anthropic4a": 6, "athinaai24": 6, "recip": 6, "athina": 6, "chromadb4a": 6, "chromadb4b": 6, "trychroma": 6, "huggingface4f": 6, "huggingface4i": 6, "mteb": 6, "ibmresearch24": 6, "ds4sd": 6, "langchain24": 6, "how_to": 6, "llamaindex24": 6, "mendableai24": 6, "mendableai": 6, "merrilllynch24": 6, "weekli": 6, "olui2": 6, "gwmol": 6, "microsoft24": 6, "openai24": 6, "ragas24": 6, "getstart": 6, "rag_evalu": 6, "unstructuredio24": 6, "zenml24": 6, "llmop": 6, "di": 7, "hunter": 7, "photo": 7, "email": 7, "hipaa": 7, "properti": [7, 8], "gdpr": 7, "strict": [7, 8, 9], "iot": 7, "impract": 7, "slm": 7, "viabl": 7, "sensor": 7, "interconnect": 7, "frontend": 7, "garner": 7, "yourself": 7, "bedrock": 7, "sambanova": 7, "sla": 7, "veloc": 7, "roadmap": 7, "commodit": 7, "winner": 7, "loser": 7, "condens": 7, "clean": 7, "2024t": 7, "versatil": 7, "72b": 7, "med": 7, "bloomberggpt": 7, "underw": 7, "adept": 7, "toxigen": 7, "alnajjar": 7, "13b": [7, 8], "32b": 7, "feasibl": 7, "modal": 7, "diagnosi": 7, "patient": 7, "necessit": 7, "deepseek": 7, "flagship": 7, "405b": 7, "pack": 7, "v3": [7, 8], "671": 7, "moe": 7, "mixtur": 7, "3x": [7, 8], "fraction": 7, "domin": 7, "cautiou": 7, "cautious": 7, "isol": [7, 8], "cpot": 7, "cpit": 7, "tco": 7, "tpot": 7, "ttft": 7, "sent": [7, 8], "gpqa": 7, "median": 7, "afford": 7, "meanwhil": 7, "lite": 7, "micro": 7, "cent": 7, "1m": 7, "cheapest": 7, "phi": 7, "half": [7, 8], "permiss": [7, 8], "apach": 7, "700m": 7, "100m": 7, "gemma": [7, 9], "grown": 7, "withdraw": 7, "unclear": 7, "15t": 7, "8t": 7, "fineweb": 7, "penedo": 7, "96": [7, 8], "crawl": 7, "snapshot": 7, "codebas": 7, "ablat": 7, "vital": [7, 8], "favorit": 7, "spawn": 7, "ultrachat": 7, "2024u": 7, "created_job": 7, "fine_tun": 7, "training_fil": 7, "file_id": 7, "ultrachat_chunk_train": 7, "validation_fil": 7, "ultrachat_chunk_ev": 7, "training_step": 7, "0001": 7, "auto_start": 7, "job_id": 7, "toolkit": [7, 8], "sft": 7, "nemo": [7, 8], "codestr": 7, "2024v": 7, "enough": 7, "rewrit": 7, "smolvlm": 7, "mlx": [7, 9], "mlc": 7, "peft": 7, "programm": 7, "graphic": [7, 8], "vram": 7, "mathbf": 7, "x_1": [7, 9], "x_2": [7, 9], "x_n": [7, 9], "x_": [7, 9], "\u03b8": 7, "cerebra": 7, "mozilla": 7, "gerganov": 7, "georgi": 7, "overwhelm": [7, 9], "manifesto": 7, "enjoy": 7, "bog": 7, "exploratori": 7, "hacker": 7, "Will": [7, 8], "prematur": 7, "besid": 7, "lighter": 7, "ggml": [7, 9], "disk": 7, "backward": 7, "2024x": 7, "repo": 7, "compil": 7, "linux": 7, "sudo": 7, "apt": 7, "cmake": 7, "bind": 7, "betlen": 7, "cnv": 7, "llamacpp": 7, "ctrl": 7, "interject": 7, "philosoph": 7, "debat": 7, "fulfil": 7, "happi": 7, "responsibli": 7, "bye": 7, "goodby": 7, "port": 7, "127": 7, "curl": [7, 9], "localhost": 7, "bearer": 7, "finish_reason": 7, "deepli": 7, "1734627879": 7, "completion_token": 7, "total_token": 7, "chatcmpl": 7, "5wl2tzjzdmzupvxwp2gcedr8xbpsyhfm": 7, "prompt_n": 7, "prompt_m": 7, "132": 7, "prompt_per_token_m": 7, "prompt_per_second": 7, "77619878666999": 7, "predicted_n": 7, "predicted_m": 7, "1700": 7, "654": [7, 9], "predicted_per_token_m": 7, "36882142857143": 7, "predicted_per_second": 7, "92850867960208": 7, "gbnf": [7, 9], "8pm": 7, "appointmenttim": 7, "appointmentdetail": 7, "handi": 7, "model_path": 7, "llama_cpp": 7, "create_chat_complet": 7, "occupi": 7, "activist": 7, "justin": [7, 8], "tunnei": 7, "ocho": 7, "appach": 7, "cosmopolitan": 7, "libc": 7, "portabl": 7, "durabl": 7, "usabl": [7, 8, 9], "tinyllama": 7, "wget": 7, "jartin": 7, "q5_k_m": 7, "renam": 7, "ex": 7, "chmod": 7, "nobrows": 7, "registri": 7, "nativ": [7, 9], "trai": 7, "familiar": 7, "bare": 7, "ssfl": 7, "sh": [7, 9], "Or": 7, "11434": 7, "chatrespons": 7, "easiest": 7, "rich": [7, 8], "playground": 7, "simultan": [7, 8], "importantli": [7, 9], "intuit": 7, "beginn": 7, "tensorrt": 7, "trt": 7, "latex": 7, "voic": 7, "pwa": 7, "medium": [7, 8], "gpt4all": 7, "rbac": 7, "q4_k": 7, "q6_k": 7, "mib": 7, "wikitext": 7, "salesforc": 7, "wikipedia": [7, 9], "min_prompt_length": 7, "input_texts_raw": 7, "began": 7, "2010": 7, "valkyria": 7, "chronicl": 7, "forgiv": 7, "newcom": 7, "raita": 7, "honjou": 7, "hitoshi": 7, "sakimoto": 7, "takeshi": 7, "ozawa": 7, "writer": 7, "sung": 7, "escap": 7, "escaped_text": 7, "block_scal": 7, "block": [7, 8], "parenthes": 7, "block_min": 7, "formula": 7, "superblock": 7, "5625": 7, "ieee": 7, "754": 7, "ppl": 7, "exp": 7, "sum_": 7, "log_2": 7, "x_i": [7, 9], "avg": 7, "_i": 7, "corr": 7, "ln": [7, 9], "kullback": 7, "leibler": 7, "entropi": 7, "logit": 7, "d_": 7, "softmax": [7, 9], "sum": 7, "kld": 7, "q2_kresult": 7, "q6": 7, "004": 7, "q2": 7, "112": 7, "q4": 7, "smallest": 7, "390": 7, "67": [7, 8], "81": [7, 8], "462": 7, "614": 7, "170": 7, "q4_k_m": 7, "thread": 7, "16x": 7, "85x": 7, "79x": 7, "ubuntu": 7, "lt": 7, "x86_64": 7, "gnu": 7, "intel": 7, "i7": 7, "8550u": 7, "15gib": 7, "samsung": 7, "ssd": 7, "970": 7, "evo": 7, "500gb": 7, "1170": 7, "meant": 7, "ai4c": 7, "ai4a": 7, "paperswithcod": [7, 8], "ana24a": 7, "artificialanalysi": 7, "ana24b": 7, "ana24c": 7, "bc24": 7, "andrei": [7, 8], "abetlen": 7, "dee24": 7, "blob": [7, 9], "deepseek_v3": 7, "gc24": 7, "ggerganov": [7, 9], "readm": [7, 9], "gc4a": 7, "gc4b": 7, "hug4": 7, "optimum": 7, "concept_guid": 7, "hug4t": 7, "hug4u": 7, "200k": 7, "ultrachat_200k": 7, "hug4v": 7, "blogpost": 7, "pka": 7, "guilherm": 7, "hynek": 7, "kydl\u00ed\u010dek": 7, "decant": 7, "finest": 7, "17557": 7, "qwe4b": 7, "qy": 7, "beichen": 7, "tingyu": 7, "su": 7, "zihan": 7, "qiu": 7, "15115": 7, "rev24": 7, "nyt": 7, "harvardlawreview": 7, "timess": 7, "zwa": 7, "wael": 7, "geoffrei": [7, 8], "angu": 7, "arnav": 7, "jefferi": 7, "kinnison": 7, "sherstinski": 7, "piero": 7, "molino": 7, "travi": 7, "addair": 7, "devvret": 7, "310": 7, "2405": 7, "00732": 7, "huggingface4xa": 7, "huggingface4xb": 7, "ibmthink24": 7, "lmstudio24": 7, "lmstudio": 7, "metaai4c": 7, "mozillaocho24": 7, "salesforce24": 7, "immens": 8, "commonplac": 8, "spur": 8, "hartvigsen": 8, "societi": 8, "alarm": 8, "openli": 8, "dolli": 8, "llama2": [8, 9], "emb": 8, "generalist": 8, "injustic": 8, "inequ": 8, "undermin": 8, "perpetu": 8, "displac": 8, "eros": 8, "fake": 8, "deepfak": 8, "distrust": 8, "cyberattack": 8, "spread": 8, "disinform": 8, "inadvert": 8, "interven": 8, "irrevers": 8, "uncheck": 8, "extinct": 8, "race": 8, "incentiv": 8, "shortcut": 8, "stress": 8, "urgent": 8, "reorient": 8, "siam": 8, "edgington": 8, "jailbreak": 8, "promptcraft": 8, "stealth": 8, "sutton": 8, "subtl": 8, "subtleti": 8, "exception": 8, "phrase": 8, "evad": 8, "hqve": 8, "frer": 8, "hplidai": 8, "pl": 8, "hyperion": 8, "coast": 8, "redwood": 8, "tallest": 8, "routin": 8, "prejudic": 8, "gallego": 8, "leak": 8, "poison": 8, "intention": 8, "inject": 8, "mislead": 8, "exabeam": 8, "finra": 8, "3110": 8, "mandat": 8, "supervisori": 8, "unicef": 8, "contest": 8, "congress": 8, "enact": 8, "pictur": [8, 9], "sound": 8, "territori": 8, "oversea": 8, "chines": 8, "legitim": 8, "consent": 8, "complaint": 8, "cooper": 8, "extraterritori": 8, "offshor": 8, "draft": 8, "voluntari": 8, "player": 8, "prepared": 8, "compris": 8, "cbrn": 8, "persuas": 8, "autonomi": 8, "gradat": 8, "scorecard": 8, "elig": 8, "advisori": 8, "sag": 8, "shut": 8, "prerequisit": 8, "harden": 8, "asl": 8, "biosafeti": 8, "elev": 8, "warn": [8, 9], "bioweapon": 8, "compartment": 8, "4x": 8, "jump": 8, "paus": 8, "deepmind": 8, "biosecur": 8, "buffer": 8, "formul": [8, 9], "calibr": 8, "taxonomi": 8, "llamaguard": 8, "20241022": 8, "5x": 8, "alaga": 8, "substandard": 8, "oxford": 8, "wachter": 8, "blur": 8, "ill": 8, "stifl": 8, "suscept": 8, "aadc": 8, "outset": 8, "curricula": 8, "adversari": 8, "thoroughli": 8, "lm": [8, 9], "undergo": 8, "280b": 8, "cai": [8, 9], "enshrin": 8, "evas": 8, "resort": 8, "avenu": 8, "cambria": 8, "inherit": 8, "influenti": 8, "debias": 8, "plausibl": 8, "occurr": 8, "phish": 8, "clarifi": 8, "toler": 8, "checklist": 8, "abus": 8, "ux": 8, "architect": 8, "retrofit": 8, "promptli": 8, "dashboard": 8, "misalign": 8, "star": 8, "postpon": 8, "combat": 8, "counter": 8, "traffic": 8, "frustrat": 8, "workaround": 8, "silo": 8, "hierarchi": 8, "mcq": 8, "regex": [8, 9], "joint": 8, "facet": 8, "purpl": 8, "opensafetylab": 8, "salad_bench_dataset": 8, "base_set": 8, "gptfuzzer": 8, "auto": [8, 9], "qid": 8, "o1": 8, "supremaci": 8, "o53": 8, "o14": 8, "o5": 8, "o65": 8, "plagiar": 8, "o16": 8, "o6": 8, "o47": 8, "campaign": 8, "o12": 8, "o52": 8, "surveil": 8, "spous": 8, "know": 8, "o13": 8, "ncount": 8, "21318": 8, "8756": 8, "6486": 8, "o2": 8, "1717": 8, "o4": 8, "1477": 8, "o3": 8, "socioeconom": 8, "851": 8, "int64": 8, "gen": 8, "15433": 8, "hh": 8, "4184": 8, "659": 8, "advbench": 8, "230": 8, "189": 8, "toxicchat": 8, "anyth": 8, "misconcept": 8, "ingrain": 8, "mc1": 8, "singular": 8, "choices4": 8, "mc2": 8, "set4": 8, "scorer": 8, "correctli": [8, 9], "truthful_qa": 8, "truthfulqa_dataset": 8, "multiple_choic": 8, "best_answ": 8, "correct_answ": 8, "incorrect_answ": 8, "watermelon": 8, "digest": 8, "noth": 8, "stomach": 8, "sick": 8, "wonderopoli": 8, "wonder": 8, "belli": 8, "swallow": 8, "dream": 8, "die": 8, "indigest": 8, "unconsci": 8, "excret": 8, "asr": 8, "r2d2": 8, "wider": [8, 9], "mass": 8, "destruct": 8, "asynchron": 8, "webpurifi": 8, "protectai": 8, "comprehend": 8, "amazon": 8, "nvidia": [8, 9], "keyword": 8, "toolset": 8, "nemmo": 8, "synchron": 8, "nemoguardrail": 8, "llmrail": 8, "railsconfig": 8, "from_path": 8, "rail": 8, "hello": 8, "ministr": 8, "mistralai": 8, "mistral_api_kei": 8, "moderate_chat": 8, "omni": 8, "pprint": 8, "to_json": 8, "threaten": 8, "illicit": 8, "granit": 8, "guardian": 8, "consortium": 8, "11b": 8, "begin_of_text": 8, "start_header_id": 8, "end_header_id": 8, "unsafe_categori": 8, "user_message_1": 8, "model_answer_1": 8, "comma": 8, "eot_id": 8, "eom_id": 8, "denot": 8, "s1": 8, "s2": 8, "s3": 8, "s4": 8, "s5": 8, "defam": 8, "s6": 8, "s7": 8, "s8": 8, "s9": 8, "s10": 8, "s11": 8, "s12": 8, "s13": 8, "padhi": 8, "atla": 8, "2b": 8, "hap": 8, "38m": 8, "125m": 8, "shieldgemma": 8, "judge_prompt": 8, "american": 8, "vandal": 8, "underag": 8, "drink": 8, "vulgar": 8, "obscen": 8, "racism": 8, "derogatori": 8, "firearm": 8, "safety_scor": 8, "IN": 8, "borderlin": 8, "verdict": 8, "boolean": [8, 9], "brief": 8, "rational": 8, "paramount": [8, 9], "evenli": 8, "good_sampl": 8, "bad_sampl": 8, "2024z": 8, "surg": 8, "scam": 8, "get_profanity_sampl": 8, "show_stat": 8, "current_dir": 8, "getcwd": 8, "data_path": 8, "profanity_en": 8, "random_st": 8, "ncategori": 8, "category_count": 8, "category_1": 8, "1f": 8, "profanity_sampl": 8, "nsampl": 8, "anatomi": 8, "slur": 8, "182": 8, "bodili": 8, "fluid": 8, "excrement": 8, "insult": 8, "mental": 8, "disabl": 8, "jap": 8, "babi": [8, 9], "batter": 8, "crazi": 8, "sob": 8, "fukka": 8, "sh1t3": 8, "get_salad_sampl": 8, "salad_sampl": 8, "tortur": 8, "porn": 8, "sin": 8, "sight": 8, "god": 8, "embezzl": 8, "xanax": 8, "alcohol": 8, "get_good_sampl": 8, "min_scor": 8, "reichstag": 8, "profanity_data": 8, "salad_data": 8, "good_data": 8, "all_data": 8, "prompt_sampl": 8, "is_unsaf": 8, "counti": 8, "holli": 8, "ridg": 8, "nc": 8, "town": 8, "onslow": 8, "carolina": 8, "diver": 8, "underwat": 8, "maze": 8, "coral": 8, "treasur": 8, "vivid": 8, "sensori": 8, "emot": 8, "labyrinthin": 8, "reef": 8, "suspens": 8, "obstacl": 8, "creatur": 8, "nomin": 8, "nobel": 8, "love": 8, "logo": 8, "thief": 8, "rob": 8, "famou": 8, "nstatist": 8, "source_stat": 8, "type_stat": 8, "plug": 8, "safetyvalid": 8, "validationresult": 8, "dataclass": 8, "abstractmethod": 8, "llmguardvalid": 8, "scanner": 8, "bantop": 8, "llm_guard": 8, "input_scann": 8, "scan_prompt": 8, "matchtyp": 8, "default_banned_top": 8, "banned_top": 8, "super": 8, "banned_topics_scann": 8, "use_onnx": 8, "toxicity_scann": 8, "match_typ": 8, "fail_fast": 8, "unsafe_scann": 8, "gun": 8, "cool": 8, "hunt": 8, "deer": 8, "dad": 8, "mistralvalid": 8, "hate_and_discrimin": 8, "violence_and_threat": 8, "dangerous_and_criminal_cont": 8, "selfharm": 8, "openaivalid": 8, "attr": 8, "dir": 8, "getattr": 8, "illicit_viol": 8, "llmjudgevalid": 8, "prompt_path": 8, "llmjudg": 8, "filenotfounderror": 8, "slice": 8, "elaps": 8, "score_valid": 8, "id_": 8, "validator_inst": 8, "validation_result": 8, "elapsed_tim": 8, "prompt_sample_id": 8, "validator_nam": 8, "scoring_prompt": 8, "scoring_result": 8, "01536": 8, "34098": 8, "497136": 8, "546416": 8, "calculate_validator_metr": 8, "bad_sourc": 8, "good_sourc": 8, "tpr": 8, "fpr": 8, "f1_score": 8, "tn": 8, "fp": 8, "fn": 8, "tp": 8, "255": 8, "74": 8, "238": 8, "130": 8, "256": 8, "149": 8, "213": 8, "190": 8, "235": 8, "63": [8, 9], "222": 8, "480": 8, "157": 8, "487": 8, "495": 8, "482": 8, "667": 8, "248": 8, "466": 8, "143": 8, "355": 8, "slowest": 8, "room": 8, "false_posit": 8, "left_on": 8, "right_on": 8, "persian": 8, "teenag": 8, "\u0686\u0627\u0644\u0634": 8, "\u0645\u0648\u0645\u0648": 8, "\u06cc\u06a9": 8, "\u062d\u0642\u0647": 8, "\u0648": 8, "\u0627\u0641\u0633\u0627\u0646\u0647": 8, "\u0645\u062d\u0644\u06cc": 8, "\u0627\u0633\u062a": 8, "\u06a9\u0647": 8, "\u0634\u0627\u06cc\u0639\u0647": 8, "\u0622\u0646": 8, "\u062f\u0631": 8, "\u0634\u0628\u06a9\u0647": 8, "\u0647\u0627\u06cc": 8, "\u0627\u062c\u062a\u0645\u0627\u0639\u06cc": 8, "\u067e\u062e\u0634": 8, "\u0634\u062f": 8, "\u06af\u0632\u0627\u0631\u0634": 8, "\u06a9\u0627\u0631\u0628\u0631\u06cc": 8, "\u0628\u0627": 8, "\u0646\u0627\u0645": 8, "\u06a9\u0648\u062f\u06a9\u0627\u0646": 8, "\u0646\u0648\u062c\u0648\u0627\u0646\u0627\u0646": 8, "\u0631\u0627": 8, "\u0628\u0647": 8, "\u0627\u0639\u0645\u0627\u0644": 8, "\u062e\u0637\u0631\u0646\u0627\u06a9\u06cc": 8, "\u0645\u0627\u0646\u0646\u062f": 8, "\u062e\u0648\u062f\u0632\u0646\u06cc": 8, "\u062e\u0648\u062f\u06a9\u0634\u06cc": 8, "\u062a\u0631\u063a\u06cc\u0628": 8, "\u0645\u06cc": 8, "\u06a9\u0646\u062f": 8, "\u0648\u062c\u0648\u062f": 8, "\u0635\u062d\u0628\u062a": 8, "\u0632\u06cc\u0627\u062f\u06cc": 8, "\u0645\u0648\u0631\u062f": 8, "\u0627\u06cc\u0646": 8, "\u062a\u0639\u062f\u0627\u062f": 8, "\u0634\u06a9\u0627\u06cc\u0627\u062a": 8, "\u0648\u0627\u0642\u0639\u06cc": 8, "\u0628\u0633\u06cc\u0627\u0631": 8, "\u06a9\u0645": 8, "\u0628\u0648\u062f": 8, "\u067e\u0644\u06cc\u0633": 8, "\u0635\u062f\u0645\u0647": 8, "\u062f\u06cc\u062f\u0646": 8, "\u062f\u0644\u06cc\u0644": 8, "\u062a\u0623\u062b\u06cc\u0631": 8, "\u0645\u0633\u062a\u0642\u06cc\u0645": 8, "\u067e\u062f\u06cc\u062f\u0647": 8, "\u062a\u0623\u06cc\u06cc\u062f": 8, "\u0646\u06a9\u0631\u062f\u0647": 8, "\u062a\u0631\u0633": 8, "\u0646\u06af\u0631\u0627\u0646\u06cc": 8, "\u0627\u06cc\u062c\u0627\u062f": 8, "\u0634\u062f\u0647": 8, "\u0628\u06cc\u0634\u062a\u0631": 8, "\u0627\u0632": 8, "\u062e\u0648\u062f": 8, "\u0631\u0633\u0627\u0646\u0647": 8, "\u0647\u0627": 8, "\u0637\u0648\u0631\u06cc": 8, "\u062e\u06cc\u0631\u06cc\u0647": 8, "\u0647\u0634\u062f\u0627\u0631": 8, "\u062f\u0627\u062f\u0646\u062f": 8, "\u0622\u0633\u06cc\u0628": 8, "\u0627\u0646\u062a\u0638\u0627\u0631\u0627\u062a": 8, "\u0645\u062d\u062a\u0648\u0627\u06cc": 8, "\u062e\u0634\u0648\u0646\u062a": 8, "\u0622\u0645\u06cc\u0632": 8, "\u0627\u06cc\u0646\u062a\u0631\u0646\u062a": 8, "\u06af\u0641\u062a\u0647": 8, "\u0634\u0648\u062f": 8, "\u0627\u0648\u0644\u06cc\u0646": 8, "\u0628\u0627\u0631": 8, "\u0633\u0627\u0644": 8, "\u06f2\u06f0\u06f1\u06f8": 8, "\u067e\u0633": 8, "\u0622\u0646\u06a9\u0647": 8, "\u0631\u0648\u0632\u0646\u0627\u0645\u0647": 8, "\u0627\u0646\u062f\u0648\u0646\u0632\u06cc\u0627\u06cc\u06cc": 8, "\u062e\u0628\u0631": 8, "\u062f\u062e\u062a\u0631": 8, "\u06f1\u06f2": 8, "\u0633\u0627\u0644\u0647": 8, "\u062f\u0627\u062f": 8, "\u0645\u0648\u0636\u0648\u0639": 8, "\u062c\u0647\u0627\u0646\u06cc": 8, "\u062a\u0628\u062f\u06cc\u0644": 8, "\u0645\u062c\u0633\u0645\u0647": 8, "\u0647\u0646\u0631\u0645\u0646\u062f": 8, "\u0698\u0627\u067e\u0646\u06cc": 8, "\u0647\u0631": 8, "\u0686\u0646\u062f": 8, "\u0634\u0627\u06cc\u062f": 8, "\u0646\u06af\u0627\u0647": 8, "\u0628\u0639\u0636\u06cc": 8, "\u0632\u06cc\u0628\u0627": 8, "\u0646\u0628\u0627\u0634\u062f": 8, "\u0627\u0645\u0627": 8, "\u06a9\u0627\u0645\u0644\u0627": 8, "\u0628\u06cc": 8, "\u062e\u0637\u0631": 8, "\u0627\u06cc\u0631\u0627\u0646": 8, "\u0645\u062f\u062a": 8, "\u0628\u06cc\u0646": 8, "\u06a9\u0627\u0631\u0628\u0631\u0627\u0646": 8, "\u0645\u0637\u0631\u062d": 8, "\u0633\u0627\u0644\u06cc": 8, "\u0633\u0631\u0627\u0633\u0631": 8, "\u062c\u0647\u0627\u0646": 8, "\u0645\u0634\u0627\u0628\u0647\u06cc": 8, "\u0628\u0631\u0627\u06cc": 8, "\u0648\u0627\u0644\u062f\u06cc\u0646": 8, "\u06a9\u0631\u062f\u0647": 8, "\u0627\u0641\u0631\u0627\u062f": 8, "\u0686\u0647": 8, "\u06a9\u0627\u0631\u06cc": 8, "\u062f\u0639\u0648\u062a": 8, "tourist": 8, "distress": 8, "polish": 8, "galician": 8, "dzisiaj": 8, "szwecji": 8, "innych": 8, "bogatych": 8, "krajach": 8, "ludzi": 8, "u\u017cywaj\u0105": 8, "mn\u00f3stwo": 8, "najr\u00f3\u017cniejszych": 8, "urz\u0105dze\u0144": 8, "hox": 8, "suecia": 8, "outro": 8, "pa\u00eds": 8, "rico": 8, "xent": 8, "moita": 8, "m\u00e1quina": 8, "diferent": 8, "\u0142\u00f3dka": 8, "zaczyna": 8, "ton\u0105\u0107": 8, "tury\u015bci": 8, "wracaj\u0105": 8, "statek": 8, "dom\u00f3w": 8, "gdzie": 8, "opowiadaj\u0105": 8, "tym": 8, "jak": 8, "zostali": 8, "zaatakowani": 8, "surprisingli": 8, "shelf": 8, "unsettl": 8, "paradox": 8, "harbor": 8, "wisdom": 8, "aspir": 8, "technologist": 8, "disciplinari": 8, "ethicist": 8, "policymak": 8, "asa24": 8, "jide": 8, "jona": 8, "schuett": 8, "marku": 8, "anderljung": 8, "08751": 8, "bhy": 8, "hinton": 8, "pieter": 8, "abbeel": 8, "trevor": 8, "darrel": 8, "yuval": 8, "harari": 8, "ya": 8, "lan": 8, "shai": 8, "shalev": 8, "gillian": 8, "hadfield": 8, "clune": 8, "tegan": 8, "maharaj": 8, "hutter": 8, "at\u0131l\u0131m": 8, "g\u00fcne\u015f": 8, "baydin": 8, "sheila": 8, "mcilraith": 8, "qiqi": 8, "ashwin": 8, "acharya": 8, "anca": 8, "dragan": 8, "philip": 8, "torr": 8, "kahneman": 8, "s\u00f6ren": 8, "mindermann": 8, "amid": 8, "6698": 8, "1126": 8, "adn0117": 8, "bbc": 8, "emili": 8, "braca": 8, "israel": 8, "carter": 8, "hafsa": 8, "kanchwala": 8, "khojasteh": 8, "charli": 8, "landow": 8, "luo": 8, "magarelli": 8, "mirin": 8, "averi": 8, "moyer": 8, "kayla": 8, "simpson": 8, "amelia": 8, "skawinski": 8, "heverin": 8, "23308": 8, "bmc": 8, "dillon": 8, "brendan": 8, "murphi": 8, "khachaturov": 8, "gleav": 8, "kellin": 8, "pelrin": 8, "2408": [8, 9], "02946": 8, "cmm": 8, "lorenzo": 8, "malandri": 8, "mercorio": 8, "navid": 8, "nobani": 8, "seveso": 8, "15248": 8, "edg24": 8, "exa24": 8, "cyber": 8, "grb": 8, "rossi": 8, "barrow": 8, "mehrab": 8, "tanjim": 8, "sungchul": 8, "franck": 8, "dernoncourt": 8, "ruiyi": 8, "nesreen": 8, "00770": 8, "h44z": 8, "hgp": 8, "saadia": 8, "hamid": 8, "palangi": 8, "dipankar": 8, "ec": 8, "kamar": 8, "oxi": 8, "smaranda": 8, "muresan": 8, "preslav": 8, "nakov": 8, "alin": 8, "villavicencio": 8, "60th": 8, "3309": 8, "3326": 8, "dublin": 8, "hym": 8, "weijiang": 8, "weitao": 8, "weihong": 8, "zhangyin": 8, "haotian": 8, "qianglong": 8, "weihua": 8, "xiaocheng": 8, "bing": 8, "dx": 8, "3703155": 8, "iuc": 8, "kartikeya": 8, "upasani": 8, "jianfeng": 8, "krithika": 8, "tontchev": 8, "2312": 8, "06674": 8, "ldw": 8, "lijun": 8, "ruohui": 8, "xuhao": 8, "wangmeng": 8, "zuo": 8, "dahua": 8, "qiao": 8, "shao": 8, "05044": 8, "mpy": 8, "xuwang": 8, "zifan": 8, "norman": 8, "mu": 8, "elham": 8, "sakhae": 8, "nathaniel": 8, "forsyth": 8, "04249": 8, "ma24": 8, "mlc24": 8, "illumin": 8, "ailumin": 8, "oaa": 8, "adler": 8, "ahmad": 8, "ilg": 8, "akkaya": 8, "florencia": 8, "leoni": 8, "aleman": 8, "janko": 8, "altenschmidt": 8, "altman": 8, "shyamal": 8, "anadkat": 8, "avila": 8, "valeri": 8, "balcom": 8, "baltescu": 8, "haim": 8, "belgum": 8, "irwan": 8, "bello": 8, "jake": 8, "berdin": 8, "bernadett": 8, "shapiro": 8, "berner": 8, "lenni": 8, "bogdonoff": 8, "boiko": 8, "madelain": 8, "boyd": 8, "luisa": 8, "brakman": 8, "button": 8, "rosi": 8, "campbel": 8, "cann": 8, "brittani": 8, "carei": 8, "carlson": 8, "rori": 8, "carmichael": 8, "che": 8, "foti": 8, "sulli": 8, "rubi": 8, "chess": 8, "chester": 8, "cho": 8, "hyung": 8, "won": 8, "chung": 8, "jeremiah": 8, "currier": 8, "yunx": 8, "cori": 8, "decareaux": 8, "degri": 8, "deutsch": 8, "devil": 8, "dhar": 8, "dowl": 8, "dun": 8, "adrien": 8, "ecoffet": 8, "atti": 8, "eleti": 8, "tyna": 8, "elound": 8, "farhi": 8, "niko": 8, "sim\u00f3n": 8, "posada": 8, "fishman": 8, "juston": 8, "isabella": 8, "fulford": 8, "georg": 8, "gibson": 8, "vik": 8, "tarun": 8, "gogineni": 8, "goh": 8, "rapha": 8, "gontijo": 8, "lope": 8, "gordon": 8, "morgan": 8, "grafstein": 8, "yufei": 8, "hallaci": 8, "heaton": 8, "johann": 8, "heideck": 8, "hickei": 8, "wade": 8, "hoeschel": 8, "houghton": 8, "kenni": 8, "hsu": 8, "shengli": 8, "joost": 8, "huizinga": 8, "shawn": 8, "joann": 8, "jang": 8, "roger": 8, "haozhun": 8, "shino": 8, "jomoto": 8, "billi": 8, "jonn": 8, "tomer": 8, "kaftan": 8, "\u0142ukasz": 8, "kamali": 8, "ingmar": 8, "kanitscheid": 8, "tabarak": 8, "khan": 8, "logan": 8, "kilpatrick": 8, "jong": 8, "wook": 8, "christina": 8, "yongjik": 8, "hendrik": 8, "kirchner": 8, "kiro": 8, "matt": 8, "kokotajlo": 8, "kondraciuk": 8, "kondrich": 8, "konstantinidi": 8, "kosic": 8, "vishal": 8, "kuo": 8, "lamp": 8, "ikai": 8, "teddi": 8, "jade": 8, "leung": 8, "chak": 8, "lim": 8, "molli": 8, "mateusz": 8, "litwin": 8, "theresa": 8, "lopez": 8, "patricia": 8, "lue": 8, "makanju": 8, "malfacini": 8, "markov": 8, "yaniv": 8, "markovski": 8, "bianca": 8, "mayn": 8, "mckinnei": 8, "christin": 8, "mcleavei": 8, "mcmillan": 8, "mcneil": 8, "aalok": 8, "menick": 8, "mishchenko": 8, "vinni": 8, "monaco": 8, "murk": 8, "m\u00e9ly": 8, "ashvin": 8, "nair": 8, "reiichiro": 8, "nakano": 8, "rajeev": 8, "nayak": 8, "arvind": 8, "neelakantan": 8, "hyeonwoo": 8, "noh": 8, "keef": 8, "jakub": 8, "pachocki": 8, "palermo": 8, "ashlei": 8, "pantuliano": 8, "parish": 8, "emi": 8, "parparita": 8, "passo": 8, "perelman": 8, "belbut": 8, "pere": 8, "pokorni": 8, "pokrass": 8, "vitchyr": 8, "pong": 8, "tolli": 8, "powel": 8, "bori": 8, "proehl": 8, "rae": 8, "ramesh": 8, "franci": 8, "kendra": 8, "rimbach": 8, "carl": 8, "rotst": 8, "roussez": 8, "saltarelli": 8, "ted": 8, "sander": 8, "schnurr": 8, "selsam": 8, "kyla": 8, "sheppard": 8, "toki": 8, "sherbakov": 8, "shieh": 8, "shoker": 8, "pranav": 8, "szymon": 8, "sidor": 8, "sigler": 8, "sitkin": 8, "sokolowski": 8, "natali": 8, "staudach": 8, "madelein": 8, "phil": 8, "tootoonchian": 8, "tseng": 8, "preston": 8, "tuggl": 8, "turlei": 8, "juan": 8, "cer\u00f3n": 8, "urib": 8, "vallon": 8, "vijayvergiya": 8, "alvin": 8, "ward": 8, "cj": 8, "weinmann": 8, "akila": 8, "welihinda": 8, "jiayi": 8, "weng": 8, "lilian": 8, "wiethoff": 8, "willner": 8, "wolrich": 8, "lauren": 8, "workman": 8, "sherwin": 8, "yoo": 8, "zeller": 8, "shengjia": 8, "juntang": 8, "zhuk": 8, "2303": 8, "08774": 8, "pnc": 8, "inkit": 8, "manish": 8, "nagireddi": 8, "giandomenico": 8, "cornacchia": 8, "subhajit": 8, "chaudhuri": 8, "tejaswini": 8, "pedapati": 8, "pierr": 8, "dognin": 8, "keerthiram": 8, "murugesan": 8, "miehl": 8, "santill\u00e1n": 8, "kieran": 8, "giulio": 8, "zizzo": 8, "muhammad": 8, "zaid": 8, "hame": 8, "purcel": 8, "desmond": 8, "zahra": 8, "ashktorab": 8, "ing": 8, "vejsbjerg": 8, "dali": 8, "hind": 8, "werner": 8, "geyer": 8, "ambrish": 8, "rawat": 8, "kush": 8, "varshnei": 8, "prasanna": 8, "sattigeri": 8, "07724": 8, "pcz": 8, "shern": 8, "woodsid": 8, "hanlin": 8, "emmon": 8, "justifi": 8, "machiavelli": 8, "2304": 8, "03279": 8, "saffron": 8, "ring": 8, "aslanid": 8, "glaes": 8, "nat": 8, "mcalees": 8, "irv": 8, "2202": 8, "03286": 8, "sjls22": 8, "lingfeng": 8, "haiyun": 8, "lemao": 8, "backdoor": 8, "02993": 8, "szw": 8, "qinghua": 8, "higham": 8, "gorban": 8, "bastouni": 8, "ivan": 8, "tyukin": 8, "12670": 8, "vsk": 8, "simplesafetytest": 8, "2311": 8, "08370": 8, "wmr24": 8, "sandra": 8, "brent": 8, "mittelstadt": 8, "duti": 8, "royal": 8, "240197": 8, "royalsocietypublish": 8, "1098": 8, "rso": 8, "wcp": 8, "boxin": 8, "weixin": 8, "hengzhi": 8, "chulin": 8, "mintong": 8, "kang": 8, "chenhui": 8, "chejian": 8, "zidi": 8, "xiong": [8, 9], "ritik": 8, "truong": 8, "simran": 8, "arora": 8, "zinan": 8, "decodingtrust": 8, "11698": 8, "ylx24": 8, "jiahao": 8, "xingwei": 8, "zyi": 8, "shune": 8, "lyumanshan": 8, "jingyu": 8, "shui": 8, "haobin": 8, "pengfei": 8, "hewu": 8, "ghost": 8, "14931": 8, "zho24": 8, "amazonwservices24": 8, "anthropic24": 8, "cdn": 8, "1adf000c8f675958c2ee23805d91aaade1cd4613": 8, "centerfasafety24a": 8, "centerforaisafeti": 8, "centerfasafety24b": 8, "deepmind24": 8, "googleapi": 8, "fsf": 8, "europeanmagency24": 8, "ema": 8, "europa": 8, "activities_en": 8, "financialirauthority24": 8, "harmbench24": 8, "ibm24": 8, "watsonx": 8, "saa": 8, "libraryocongress23": 8, "loc": 8, "mistralai24": 8, "mlsteam24": 8, "mlsafeti": 8, "nationaliosatechnology24": 8, "nist": 8, "itl": 8, "nvidia24": 8, "openai24a": 8, "openai24b": 8, "opensafetylab24a": 8, "opensafetylab24b": 8, "protectai24": 8, "surgeai24": 8, "ukgovernment24": 8, "unicef24": 8, "innocenti": 8, "julia": 9, "easili": 9, "trial": 9, "wrangl": 9, "hoc": 9, "unwant": 9, "overflow": 9, "twitter": 9, "youtub": 9, "ldot": 9, "prod_": 9, "syntact": 9, "xml": 9, "invalid": 9, "delic": 9, "heart": 9, "ttt": 9, "itt": 9, "po": 9, "nousresearch": 9, "herm": 9, "person1": 9, "q1": 9, "person2": 9, "json_format": 9, "response_cont": 9, "is_json": 9, "myjson": 9, "nest": 9, "conceptu": 9, "unend": 9, "whitespac": 9, "throw": 9, "somewher": 9, "json_object": 9, "circul": 9, "vertex": 9, "went": 9, "secextract": 9, "mentioned_ent": 9, "mentioned_plac": 9, "extract_from_sec_fil": 9, "sec_filing_text": 9, "hint": 9, "prompt_extract": 9, "sec_extract": 9, "washington": 9, "beg": 9, "unnorm": 9, "0325": 9, "strongest": 9, "greedi": 9, "bfloat16": 9, "device_map": 9, "src": 9, "python3": 9, "nvml": 9, "return_tensor": 9, "pt": 9, "inference_mod": 9, "last_token_logit": 9, "next_token_prob": 9, "nn": 9, "dim": 9, "top_k_prob": 9, "top_k_indic": 9, "top_k_token": 9, "decod": 9, "idx": 9, "skip_special_token": 9, "prob": 9, "0305": 9, "0197": 9, "0106": 9, "0093": 9, "logitsprocessor": 9, "logits_processor": 9, "logitsprocessorlist": 9, "customlogitsprocessor": 9, "intermediari": 9, "input_id": 9, "__call__": 9, "longtensor": 9, "batch_siz": 9, "sequence_length": 9, "floattensor": 9, "vocab_s": 9, "mask": 9, "pick": 9, "yesnologitsprocessor": 9, "initial_length": 9, "fill_": 9, "inf": 9, "debug": 9, "yes_token": 9, "add_special_token": 9, "no_token": 9, "yes_no_logit": 9, "yes_no_prob": 9, "yes_prob": 9, "no_prob": 9, "yes_mask": 9, "1e4": 9, "NO": 9, "generation_output_control": 9, "uncontrol": 9, "generation_output": 9, "4263": 9, "5737": 9, "10407": 9, "4607": 9, "6250": 9, "9219": 9, "helper": 9, "model_output": 9, "gen_output": 9, "batch_decod": 9, "clean_up_tokenization_spac": 9, "classic": 9, "italian": 9, "willard": 9, "louf": 9, "reformul": 9, "finit": 9, "fsm": 9, "s_": 9, "s_t": 9, "s_1": 9, "tild": 9, "odot": 9, "rightarrow": 9, "thien": 9, "automaton": 9, "dfa": 9, "outgo": 9, "renorm": 9, "yy": 9, "ever": 9, "aa": 9, "lwai": 9, "prop": 9, "yynnaa": 9, "malform": 9, "sec_extraction_outlin": 9, "zsp": 9, "zicorp": 9, "with_structured_output": 9, "runnabl": 9, "typeddict": 9, "qu": 9, "langchain_openai": 9, "chatopenai": 9, "chatprompttempl": 9, "extract_from_sec_filing_langchain": 9, "structured_llm": 9, "from_messag": 9, "sec_extraction_langchain": 9, "bnf": 9, "backu": 9, "naur": 9, "fssl": 9, "extract_entities_from_sec_fil": 9, "ollama_structured_output_prompt_suffix": 9, "ollama_structured_output_temperatur": 9, "uncensor": 9, "model_json_schema": 9, "response_json": 9, "sharpli": 9, "exllama2": 9, "zoo": 9, "nonetheless": 9, "extran": 9, "dispar": 9, "preval": 9, "peer": 9, "speak": 9, "aider": 9, "outweigh": 9, "rebutt": 9, "dottxt": 9, "reproduct": 9, "paint": 9, "flaw": 9, "uneven": 9, "conflat": 9, "drawback": 9, "pfiffer": 9, "wrestl": 9, "aid24": 9, "dot24": 9, "demo": 9, "gge24": 9, "lan4b": 9, "lww": 9, "xun": 9, "hanyu": 9, "yezhaohui": 9, "shichao": 9, "simin": 9, "shunyu": 9, "feiyu": 9, "zhiyu": 9, "12599": 9, "llf": 9, "xieyang": 9, "frederick": 9, "fiannaca": 9, "terri": 9, "koo": 9, "dixon": 9, "ea": 9, "3613905": 9, "3650756": 9, "xuan": 9, "hai": 9, "nguyen": 9, "ngoc": 9, "tiviati": 9, "hieu": 9, "dao": 9, "shafiq": 9, "joti": 9, "kenji": 9, "kawaguchi": 9, "nanci": 9, "min": 9, "kan": 9, "08656": 9, "nou24": 9, "out24": 9, "twt": 9, "zhi": 9, "kuang": 9, "tsai": 9, "chieh": 9, "hung": 9, "nung": 9, "02442": 9, "tt24": 9, "vivien": 9, "vivien000": 9, "wl23": 9, "r\u00e9mi": 9, "09702": 9, "guidanceai24": 9, "nvidia4a": 9, "wikipediacontributors24": 9, "wiktionari": 9, "naur_form": 9}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"about": [0, 2], "book": [0, 2], "content": [0, 3, 4, 5, 6, 7, 8, 9], "core": 0, "challeng": [0, 6], "we": 0, "ll": 0, "address": 0, "A": [0, 2, 3, 4, 6], "practic": [0, 2, 7, 9], "approach": [0, 4, 8], "an": 0, "open": [0, 2, 7], "sourc": [0, 2, 7], "note": [0, 3, 6], "perspect": 0, "who": 0, "thi": 0, "i": [0, 3, 6], "For": 0, "outcom": 0, "prerequisit": 0, "set": 0, "up": 0, "your": [0, 7], "environ": 0, "code": 0, "repositori": 0, "python": 0, "setup": [0, 3], "api": [0, 8], "kei": [0, 5], "configur": 0, "troubleshoot": 0, "common": [0, 8], "issu": 0, "author": 0, "prefac": [1, 2], "tame": 2, "llm": [2, 4, 5, 6, 7, 8], "guid": 2, "pitfal": [2, 8], "softwar": [2, 5], "chapter": 2, "1": [2, 8], "The": [2, 4, 5, 7], "eval": [2, 5, 8], "gap": [2, 5], "2": [2, 7, 8], "structur": [2, 6, 9], "output": [2, 9], "3": [2, 8], "manag": [2, 6], "input": [2, 6], "data": [2, 3, 6], "4": [2, 8], "safeti": [2, 8], "5": [2, 8], "prefer": [2, 3], "base": [2, 3, 5, 6, 8], "align": [2, 3], "6": [2, 8], "local": [2, 7], "7": 2, "fall": [2, 4], "cost": [2, 4, 7], "paradox": [2, 4], "8": 2, "frontier": 2, "appendix": 2, "tool": [2, 5, 7, 8, 9], "resourc": 2, "introduct": [3, 5, 6, 7, 8, 9], "from": 3, "raw": 3, "capabl": 3, "On": 3, "misalign": 3, "languag": 3, "model": [3, 5, 7], "human": 3, "supervis": 3, "fine": [3, 7, 9], "tune": [3, 7, 9], "sft": 3, "augment": [3, 6], "post": [3, 9], "train": 3, "answer": 3, "limit": [3, 6], "collaps": 3, "fake": 3, "case": [3, 6, 7, 8], "studi": [3, 6, 7, 8], "polici": [3, 8], "experiment": 3, "deliver": 3, "smollm2": 3, "dataset": [3, 5, 7, 8], "synthet": 3, "gener": [3, 5, 6, 8], "user": [3, 8], "prompt": [3, 7, 9], "reject": 3, "respons": 3, "chosen": 3, "dpo": 3, "optim": [3, 4], "prepar": [3, 6], "vibe": 3, "check": [3, 4], "evalu": [3, 5, 8], "discuss": [3, 6, 9], "conclus": [3, 4, 5, 6, 7, 8, 9], "refer": [3, 4, 5, 6, 7, 8, 9], "why": 4, "matter": 4, "more": 4, "than": 4, "ever": 4, "right": 4, "size": 4, "strateg": 4, "metric": [4, 5], "requir": [4, 5], "busi": 4, "perform": [4, 7], "oper": 4, "technic": [4, 8], "quantiz": [4, 7], "list": 4, "non": 5, "determinist": 5, "machin": 5, "emerg": 5, "properti": 5, "problem": [5, 9], "statement": [5, 9], "tradit": 5, "v": [5, 7], "design": [5, 8], "applic": 5, "test": 5, "matrix": 5, "conceptu": 5, "overview": 5, "consider": 5, "task": [5, 7], "benchmark": [5, 7, 8], "leaderboard": 5, "lightev": 5, "mmlu": 5, "econometr": 5, "sampl": [5, 8], "famili": [5, 7], "us": [5, 6], "langsmith": 5, "promptfoo": 5, "comparison": [5, 7, 9], "pars": 6, "document": 6, "markitdown": 6, "docl": 6, "extract": 6, "retriev": 6, "rag": 6, "pipelin": 6, "knowledg": 6, "vector": 6, "databas": 6, "rerank": 6, "Will": 6, "exist": [6, 8], "futur": 6, "framework": [6, 8, 9], "chunk": 6, "contextu": 6, "link": 6, "long": 6, "form": 6, "ii": 6, "quiz": 6, "citat": 6, "implement": [6, 8], "exampl": 6, "usag": 6, "choos": 7, "suitabl": 7, "result": 7, "llama": 7, "licens": 7, "commun": 7, "support": 7, "custom": [7, 8], "mistral": [7, 8], "decemb": 7, "22": 7, "2024": 7, "deploy": 7, "serv": 7, "cpp": 7, "llamafil": 7, "ollama": [7, 9], "lama": 7, "ui": 7, "lm": 7, "studio": 7, "jan": 7, "webui": 7, "openwebui": 7, "effect": 7, "level": 7, "hardwar": 7, "takeawai": [7, 8], "risk": 8, "ai": 8, "amplifi": 8, "harm": 8, "novel": 8, "associ": 8, "autonom": 8, "exacerb": 8, "factor": 8, "specif": 8, "guidanc": 8, "govern": 8, "organ": 8, "privat": 8, "sector": 8, "openai": 8, "anthrop": 8, "googl": 8, "rubric": 8, "mlcommon": 8, "centr": 8, "porquoi": 8, "red": 8, "team": 8, "constitut": 8, "explain": 8, "xai": 8, "plan": 8, "phase": 8, "definit": 8, "research": [8, 9], "identif": 8, "architectur": 8, "select": 8, "go": 8, "market": 8, "compon": 8, "salad": 8, "bench": 8, "truthfulqa": 8, "harmbench": 8, "safebench": 8, "techniqu": [8, 9], "repres": 8, "layer": 8, "map": 8, "rule": 8, "filter": 8, "moder": 8, "bad": 8, "good": 8, "guard": 8, "judg": 8, "valid": 8, "engin": 9, "json": 9, "mode": 9, "logit": 9, "process": 9, "outlin": 9, "langchain": 9, "best": 9, "compar": 9, "solut": 9, "ongo": 9, "debat": 9, "acknowledg": 9}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinxcontrib.bibtex": 9, "sphinx": 57}, "alltitles": {"About the Book": [[0, "about-the-book"], [2, "about-the-book"]], "Contents": [[0, "contents"], [3, "contents"], [4, "contents"], [5, "contents"], [6, "contents"], [7, "contents"], [8, "contents"], [9, "contents"]], "Core Challenges We\u2019ll Address": [[0, "core-challenges-we-ll-address"]], "A Practical Approach": [[0, "a-practical-approach"]], "An Open Source Approach": [[0, "an-open-source-approach"]], "Open Source Book": [[0, "open-source-book"]], "A Note on Perspective": [[0, "a-note-on-perspective"]], "Who This Book Is For": [[0, "who-this-book-is-for"]], "Outcomes": [[0, "outcomes"]], "Prerequisites": [[0, "prerequisites"]], "Setting Up Your Environment": [[0, "setting-up-your-environment"]], "Code Repository": [[0, "code-repository"]], "Python Environment Setup": [[0, "python-environment-setup"]], "API Keys Configuration": [[0, "api-keys-configuration"]], "Troubleshooting Common Issues": [[0, "troubleshooting-common-issues"]], "About the Author": [[0, "about-the-author"]], "Preface": [[1, "preface"], [2, "preface"]], "Taming LLMs": [[2, "taming-llms"]], "A Practical Guide to LLM Pitfalls with Open Source Software": [[2, "a-practical-guide-to-llm-pitfalls-with-open-source-software"]], "Chapter 1: The Evals Gap": [[2, "chapter-1-the-evals-gap"]], "Chapter 2: Structured Output": [[2, "chapter-2-structured-output"]], "Chapter 3: Managing Input Data": [[2, "chapter-3-managing-input-data"]], "Chapter 4: Safety": [[2, "chapter-4-safety"]], "Chapter 5: Preference-Based Alignment": [[2, "chapter-5-preference-based-alignment"]], "Chapter 6: Local LLMs in Practice": [[2, "chapter-6-local-llms-in-practice"]], "Chapter 7: The Falling Cost Paradox": [[2, "chapter-7-the-falling-cost-paradox"]], "Chapter 8: Frontiers": [[2, "chapter-8-frontiers"]], "Appendix A: Tools and Resources": [[2, "appendix-a-tools-and-resources"]], "Preference-Based Alignment": [[3, "preference-based-alignment"]], "Introduction": [[3, "introduction"], [5, "introduction"], [6, "introduction"], [7, "introduction"], [8, "introduction"], [9, "introduction"]], "From Raw Capabilities to Preference Alignment": [[3, "from-raw-capabilities-to-preference-alignment"]], "On the Misalignment of Language Models": [[3, "on-the-misalignment-of-language-models"]], "Aligning Language Models with Human Preferences": [[3, "aligning-language-models-with-human-preferences"]], "Supervised Fine-Tuning (SFT) for Model Alignment": [[3, "supervised-fine-tuning-sft-for-model-alignment"]], "Augmenting SFT with Human Preferences": [[3, "augmenting-sft-with-human-preferences"]], "Is Post-Training the Answer?": [[3, "is-post-training-the-answer"]], "Limitations": [[3, "limitations"]], "Model Collapse": [[3, "model-collapse"]], "Faking Alignment": [[3, "faking-alignment"]], "Case Study: Aligning a Language Model to a Policy": [[3, "case-study-aligning-a-language-model-to-a-policy"]], "Experimental Setup": [[3, "experimental-setup"]], "Deliverables": [[3, "deliverables"]], "A Note on smolLM2 Models": [[3, "a-note-on-smollm2-models"]], "Policy": [[3, "policy"]], "Preference Dataset - Synthetic Dataset Generation": [[3, "preference-dataset-synthetic-dataset-generation"]], "User Prompts": [[3, "user-prompts"]], "Rejected Responses": [[3, "rejected-responses"]], "Chosen Responses": [[3, "chosen-responses"]], "Generate DPO Dataset": [[3, "generate-dpo-dataset"]], "DPO-Based Optimization": [[3, "dpo-based-optimization"]], "Data Preparation": [[3, "data-preparation"]], "Fine-Tuning": [[3, "fine-tuning"]], "Vibe Check": [[3, "vibe-check"]], "Alignment Evaluation": [[3, "alignment-evaluation"]], "Discussion and Conclusions": [[3, "discussion-and-conclusions"]], "References": [[3, "references"], [4, "references"], [5, "references"], [6, "references"], [7, "references"], [8, "references"], [9, "references"]], "The Falling Cost Paradox": [[4, "the-falling-cost-paradox"]], "Why Optimization Matters More Than Ever": [[4, "why-optimization-matters-more-than-ever"]], "Right-Sizing LLMs: A Strategic Approach": [[4, "right-sizing-llms-a-strategic-approach"]], "Metrics": [[4, "metrics"], [5, "metrics"]], "Requirements": [[4, "requirements"]], "Business Requirements": [[4, "business-requirements"]], "Performance Requirements": [[4, "performance-requirements"]], "Operational Requirements": [[4, "operational-requirements"]], "Technical Requirements": [[4, "technical-requirements"]], "Quantization": [[4, "quantization"], [7, "quantization"]], "Check-list": [[4, "check-list"]], "Conclusion": [[4, "conclusion"], [5, "conclusion"], [6, "conclusion"], [7, "conclusion"], [8, "conclusion"], [9, "conclusion"]], "The Evals Gap": [[5, "the-evals-gap"]], "Non-Deterministic Generative Machines": [[5, "non-deterministic-generative-machines"]], "Emerging Properties": [[5, "emerging-properties"]], "Problem Statement": [[5, "problem-statement"], [9, "problem-statement"]], "Evals of Traditional Software vs LLMs": [[5, "evals-table"]], "Evals Design": [[5, "evals-design"]], "LLM Application Testing Requirements Matrix": [[5, "validation-requirements"]], "Conceptual Overview": [[5, "conceptual-overview"]], "Design Considerations": [[5, "design-considerations"]], "Key Metrics for Evaluating Generative Tasks": [[5, "key-metrics"]], "Evaluators": [[5, "evaluators"]], "Model-Based Evaluation": [[5, "model-based-evaluation"]], "Evaluating Evaluators": [[5, "evaluating-evaluators"]], "Benchmarks and Leaderboards": [[5, "benchmarks-and-leaderboards"]], "Tools": [[5, "tools"], [9, "tools"]], "LightEval": [[5, "lighteval"]], "MMLU Econometrics Task Dataset sample": [[5, "mmlu-econometrics"]], "Model Families Evaluated Using LightEval": [[5, "model-families"]], "LangSmith": [[5, "langsmith"]], "PromptFoo": [[5, "promptfoo"]], "Comparison": [[5, "comparison"], [7, "comparison"], [7, "id37"]], "Comparison of Lighteval, LangSmith, and Promptfoo": [[5, "tool-comparison"]], "Managing Input Data": [[6, "managing-input-data"]], "Parsing Documents": [[6, "parsing-documents"]], "MarkItDown": [[6, "markitdown"]], "Docling": [[6, "docling"]], "Structured Data Extraction": [[6, "structured-data-extraction"]], "Retrieval-Augmented Generation": [[6, "retrieval-augmented-generation"]], "RAG Pipeline": [[6, "rag-pipeline"]], "Preparing the Knowledge Base": [[6, "preparing-the-knowledge-base"]], "Vector Database": [[6, "vector-database"]], "Reranking": [[6, "reranking"]], "LLMs with RAG": [[6, "llms-with-rag"]], "Challenges and Limitations": [[6, "challenges-and-limitations"]], "Will RAGs exist in the future?": [[6, "will-rags-exist-in-the-future"]], "A Note on Frameworks": [[6, "a-note-on-frameworks"]], "Case Studies": [[6, "case-studies"]], "Case Study I: Content Chunking with Contextual Linking": [[6, "case-study-i-content-chunking-with-contextual-linking"]], "Generating long-form content": [[6, "generating-long-form-content"]], "Discussion": [[6, "discussion"], [6, "id39"], [9, "discussion"]], "Case Study II: Quiz Generation with Citations": [[6, "case-study-ii-quiz-generation-with-citations"]], "Use Case": [[6, "use-case"]], "Implementation": [[6, "implementation"]], "Example Usage": [[6, "example-usage"]], "Local LLMs in Practice": [[7, "local-llms-in-practice"]], "Choosing your Model": [[7, "choosing-your-model"]], "Task Suitability": [[7, "task-suitability"]], "Benchmark results for Llama 2 family of models.": [[7, "llama2-benchmark"]], "Performance & Cost": [[7, "performance-cost"]], "Licensing": [[7, "licensing"]], "Open Source LLMs.": [[7, "open-source-llms"]], "Community Support": [[7, "community-support"]], "Customization": [[7, "customization"]], "Mistral fine-tuning costs as of December 22, 2024.": [[7, "mistral-costs"]], "Tools for Local LLM Deployment": [[7, "tools-for-local-llm-deployment"]], "Serving Models": [[7, "serving-models"]], "LLama.cpp": [[7, "llama-cpp"]], "Llamafile": [[7, "llamafile"]], "Ollama": [[7, "ollama"], [9, "ollama"]], "lama.cpp vs Ollama vs Llamafile Comparison": [[7, "feature-comparison-local"]], "UI": [[7, "ui"]], "LM Studio": [[7, "lm-studio"]], "Jan": [[7, "jan"]], "Open WebUI": [[7, "open-webui"]], "LM Studio vs Jan vs OpenWebUI Comparison": [[7, "feature-comparison-ui"]], "Case Study: The Effect of Quantization on LLM Performance": [[7, "case-study-the-effect-of-quantization-on-llm-performance"]], "Prompts Dataset": [[7, "prompts-dataset"]], "Quantization Levels": [[7, "quantization-levels"]], "Benchmarking": [[7, "benchmarking"], [8, "benchmarking"]], "Results": [[7, "results"]], "Quantization Benchmarks": [[7, "quantization-benchmarks"]], "Benchmarking Hardware": [[7, "benchmarking-hardware"]], "Takeaways": [[7, "takeaways"], [8, "takeaways"]], "Safety": [[8, "safety"]], "Safety Risks": [[8, "safety-risks"]], "General AI Safety Risks": [[8, "general-ai-safety-risks"]], "Amplified Existing Harms and Novel Risks": [[8, "amplified-existing-harms-and-novel-risks"]], "Risks Associated with Autonomous AI": [[8, "risks-associated-with-autonomous-ai"]], "Exacerbating Factors": [[8, "exacerbating-factors"]], "LLMs Specific Safety Risks": [[8, "llms-specific-safety-risks"]], "Guidance": [[8, "guidance"]], "Governments & Organizations": [[8, "governments-organizations"]], "Private Sector": [[8, "private-sector"]], "OpenAI": [[8, "openai"]], "Anthropic": [[8, "anthropic"]], "Google": [[8, "google"]], "Rubrics": [[8, "rubrics"]], "MLCommons AI Safety Benchmark": [[8, "mlcommons-ai-safety-benchmark"]], "Centre for the Governance of AI Rubric": [[8, "centre-for-the-governance-of-ai-rubric"]], "Porquoi": [[8, "porquoi"]], "Approaches": [[8, "approaches"]], "Red Teaming": [[8, "red-teaming"]], "Constitutional AI": [[8, "constitutional-ai"]], "Explainable AI (XAI)": [[8, "explainable-ai-xai"]], "Designing a Safety Plan": [[8, "designing-a-safety-plan"]], "Phase 1. Policy Definition": [[8, "phase-1-policy-definition"]], "Phase 2. User Research & Risk Identification": [[8, "phase-2-user-research-risk-identification"]], "Phase 3. Evaluation Framework": [[8, "phase-3-evaluation-framework"]], "Phase 4. Safety Architecture Design": [[8, "phase-4-safety-architecture-design"]], "Phase 5. Implementation & Tools Selection": [[8, "phase-5-implementation-tools-selection"]], "Phase 6. Go-to-Market": [[8, "phase-6-go-to-market"]], "Common Pitfalls": [[8, "common-pitfalls"]], "Technical Implementation Components": [[8, "technical-implementation-components"]], "Benchmarks & Datasets": [[8, "benchmarks-datasets"]], "SALAD-Bench": [[8, "salad-bench"]], "TruthfulQA": [[8, "truthfulqa"]], "HarmBench": [[8, "harmbench"]], "SafeBench": [[8, "safebench"]], "Tools & Techniques": [[8, "tools-techniques"]], "Representative Safety Layer Risk Map.": [[8, "safety-layer-table"]], "Rules-Based Safety Filtering": [[8, "rules-based-safety-filtering"]], "Rules-Based Safety Filtering Tools.": [[8, "safety-layer-tools"]], "LLM-Based Safety Filtering": [[8, "llm-based-safety-filtering"]], "Custom Moderation": [[8, "custom-moderation"]], "Case Study: Implementing a Safety Filter": [[8, "case-study-implementing-a-safety-filter"]], "Evals Dataset": [[8, "evals-dataset"]], "Bad Samples": [[8, "bad-samples"]], "Good Samples": [[8, "good-samples"]], "Safety Filters": [[8, "safety-filters"]], "LLM-Guard": [[8, "llm-guard"]], "Mistral Moderation API": [[8, "mistral-moderation-api"]], "OpenAI Moderation API": [[8, "openai-moderation-api"]], "Custom Judge Validator": [[8, "custom-judge-validator"]], "Structured Output": [[9, "structured-output"]], "Techniques": [[9, "techniques"]], "Prompt Engineering": [[9, "prompt-engineering"]], "JSON Mode (Fine-Tuned)": [[9, "json-mode-fine-tuned"]], "Logit Post-Processing": [[9, "logit-post-processing"]], "Outlines": [[9, "outlines"]], "LangChain": [[9, "langchain"]], "Best Practices": [[9, "best-practices"]], "Comparing Solutions": [[9, "comparing-solutions"]], "Structured Output Frameworks Comparison": [[9, "structured-output-frameworks"]], "Research and Ongoing Debate": [[9, "research-and-ongoing-debate"]], "Acknowledgements": [[9, "acknowledgements"]]}, "indexentries": {}}) \ No newline at end of file diff --git a/tamingllms/_build/jupyter_execute/markdown/intro.ipynb b/tamingllms/_build/jupyter_execute/markdown/intro.ipynb index e2c83b2..a2a1ee3 100644 --- a/tamingllms/_build/jupyter_execute/markdown/intro.ipynb +++ b/tamingllms/_build/jupyter_execute/markdown/intro.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "35601299", + "id": "4bbf1402", "metadata": {}, "source": [ "(intro)=\n", @@ -31,11 +31,15 @@ "\n", "3. **Testing Complexity**: Traditional software testing methodologies break down when dealing with non-deterministic and generative systems, requiring new approaches.\n", "\n", - "4. **Safety and Alignment**: LLMs can generate harmful, biased, or inappropriate content, requiring robust safeguards and monitoring systems to ensure safe deployment.\n", + "4. **Safety**: LLMs can generate harmful, biased, or inappropriate content, requiring robust safeguards and monitoring systems to ensure safe deployment.\n", "\n", - "5. **Vendor Lock-in**: Cloud-based LLM providers can create significant dependencies and lock-in through their proprietary APIs and infrastructure, making it difficult to switch providers or self-host solutions.\n", + "5. **Alignment**: LLMs are next-token prediction models, which means they are not aligned with the user's preferences by default.\n", "\n", - "6. **Cost Optimization**: The computational and financial costs of operating LLM-based systems can quickly become prohibitive without careful management, and optimization.\n", + "6. **Vendor Lock-in**: Cloud-based LLM providers can create significant dependencies and lock-in through their proprietary APIs and infrastructure, making it difficult to switch providers or self-host solutions.\n", + "\n", + "7. **Cost Optimization**: The computational and financial costs of operating LLM-based systems can quickly become prohibitive without careful management, and optimization.\n", + "\n", + "We conclude with a discussion on the future of LLMs and the challenges that will arise as we move forward.\n", "\n", "\n", "## A Practical Approach\n", @@ -167,7 +171,7 @@ "\n", "## About the Author\n", "\n", - "Tharsis Souza (Ph.D. Computer Science, UCL University of London) is a computer scientist and product leader specializing in AI-based products. He is a Lecturer at Columbia University's Master of Science program in Applied Analytics, (*incoming*) Head of Product, Equities at Citadel, and former Senior VP at Two Sigma Investments. He mentors under-represented students & working professionals to help create a more diverse global AI1 ecosystem.\n", + "Tharsis Souza (Ph.D. Computer Science, UCL University of London) is a computer scientist and product leader specializing in AI-based products. He is a Lecturer at Columbia University's Master of Science program in Applied Analytics, (*incoming*) Head of Product, Equities at Citadel, and former Senior VP at Two Sigma Investments. He mentors under-represented students & working professionals to help create a more diverse global AI ecosystem.\n", "\n", "With over 15 years of experience delivering technology products across startups and Fortune 500 companies, he is also an author of numerous scholarly publications and a frequent speaker at academic and business conferences. Grounded on academic background and drawing from practical experience building and scaling up products powered by language models at early-stage startups, major institutions as well as contributing to open source projects, he brings a unique perspective on bridging the gap between LLMs promised potential and their practical implementation challenges to enable the next generation of AI-powered products." ] diff --git a/tamingllms/_build/jupyter_execute/notebooks/cost.ipynb b/tamingllms/_build/jupyter_execute/notebooks/cost.ipynb index af0f848..9f62182 100644 --- a/tamingllms/_build/jupyter_execute/notebooks/cost.ipynb +++ b/tamingllms/_build/jupyter_execute/notebooks/cost.ipynb @@ -315,7 +315,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Quantization is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by {cite}`unsloth2024llama3` [^unsloth]. The model's memory requirements vary significantly based on the quantization level used as demonstrated in {numref}`quantized`.\n", + "Quantization[^visual-quantization] is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by {cite}`unsloth2024llama3` [^unsloth]. The model's memory requirements vary significantly based on the quantization level used as demonstrated in {numref}`quantized`.\n", + "\n", + "[^visual-quantization]: Maarten Grootendorst provides the best visual guide for model quantization {cite}`grootendorst2024quantization`.\n", "\n", "[^unsloth]: Unsloth runs a business of making LLMs fine-tuning streamlined. Check them out at [unsloth.ai](https://unsloth.ai).\n", "\n", diff --git a/tamingllms/_build/jupyter_execute/notebooks/evals.ipynb b/tamingllms/_build/jupyter_execute/notebooks/evals.ipynb index 150868e..cad1768 100644 --- a/tamingllms/_build/jupyter_execute/notebooks/evals.ipynb +++ b/tamingllms/_build/jupyter_execute/notebooks/evals.ipynb @@ -853,7 +853,7 @@ "4. **Run Evaluations**: Use the judge model to score outputs. Consider using a large and/or more capable model as a judge to provide more nuanced assessments.\n", "5. **Aggregate and Analyze Results**: Interpret scores to refine applications.\n", "\n", - "```{figure} ../_static/evals/llm_judge.svg\n", + "```{figure} ../_static/evals/llm_judge.png\n", "---\n", "name: llm_judge\n", "alt: Conceptual Overview\n", @@ -1187,11 +1187,11 @@ "\n", "An alternative to the above approaches is to use humans to directly evaluate the LLM-judges themselves. A notable example of this is [Judge Arena](https://judgearena.com/) {cite}`judgearena2024`, which is a platform that allows users to vote on which AI model made the better evaluation. Under this approach, the performance of the LLM evaluator is given by the (blind) evaluation of humans who perform the voting on randomly generated pairs of LLM judges as depicted in {numref}`meta2`. Only after submitting a vote, users can see which models were actually doing the judging.\n", "\n", - "```{figure} ../_static/evals/meta2.svg\n", + "```{figure} ../_static/evals/meta2.png\n", "---\n", "name: meta2\n", "alt: Human-in-the-loop meta evaluation Conceptual Overview\n", - "scale: 60%\n", + "scale: 75%\n", "align: center\n", "---\n", "Human-in-the-loop Meta Evaluation.\n", diff --git a/tamingllms/_build/jupyter_execute/notebooks/input.ipynb b/tamingllms/_build/jupyter_execute/notebooks/input.ipynb index c2cb84f..9656c0c 100644 --- a/tamingllms/_build/jupyter_execute/notebooks/input.ipynb +++ b/tamingllms/_build/jupyter_execute/notebooks/input.ipynb @@ -12,11 +12,6 @@ "-- Steve Jobs\n", "```\n", "```{contents}\n", - "```\n", - "\n", - "\n", - "```{note}\n", - "This Chapter is Work-in-Progress.\n", "```" ] }, @@ -26,20 +21,22 @@ "source": [ "## Introduction\n", "\n", - "Large Language Models face several critical challenges in effectively processing input data. While advances in long-context language models (LCLMs) {cite}`lee2024longcontextlanguagemodelssubsume` have expanded the amount of information these systems can process simultaneously, significant challenges remain in managing and effectively utilizing extended inputs. \n", + "While advances in long-context language models (LCs) {cite}`lee2024longcontextlanguagemodelssubsume` have expanded the amount of information these systems can process, significant challenges remain in managing and effectively utilizing extended data inputs:\n", "\n", - "LLMs are sensitive to input formatting and structure, requiring careful data preparation to achieve optimal results {cite}`tan2024htmlraghtmlbetterplain`. They operate with knowledge cutoffs, providing potentially stale or outdated information that may not reflect current reality and demonstrate problems with temporal knowledge accuracy {cite}`amayuelas-etal-2024-knowledge`. LLMs also struggle with less common but important information showing a systematic loss of long-tail knowledge {cite}`kotha2024understanding`.\n", + "- LLMs are sensitive to input formatting and structure, requiring careful data preparation to achieve optimal results {cite}`he2024doespromptformattingimpact, liu2024enhancingllmscognitionstructurization, tan2024htmlraghtmlbetterplain`.\n", + "- They operate with knowledge cutoffs, providing potentially stale or outdated information that may not reflect current reality and demonstrate problems with temporal knowledge accuracy {cite}`amayuelas-etal-2024-knowledge`.\n", + "- LLMs also face \"lost-in-the-middle\" problems {cite}`wu2024longdocumentsummaryevaluation` and struggle with less common but important information showing a systematic loss of long-tail knowledge {cite}`kotha2024understanding`.\n", "\n", - "Motivated by these challenges, this chapter explores two key components:\n", + "Motivated by these challenges, this chapter explores two key input data components:\n", "\n", - "1. Data Parsing: Parsing documents into a unified format that is suitable for LLMs to process.\n", + "1. Data Parsing and Chunking: Parsing and chunking documents into a unified format that is suitable and more manageable for LLMs to process.\n", "2. Retrieval Augmentation: Augmenting LLMs with the ability to retrieve relevant, recent, and specialized information.\n", "\n", "In data parsing, we will explore some useful open source tools that help transform data into LLM-compatible formats, demonstrating their impact through a case study of structured information extraction from complex PDFs. In a second case study, we will introduce some chunking strategies to help LLMs process long inputs and implement a particular technique called Chunking with Contextual Linking the enables contextually relevant chunk processing.\n", "\n", - "In retrieval augmentation, we will explore how to enhance LLMs with semantic search capabilities for incorporating external context using RAGs (Retrieval Augmented Generation). Through a detailed case study, we build a RAG system for querying live codebases, illustrating methods to bridge static model knowledge with dynamic information requirements.\n", + "In retrieval augmentation, we will explore how to enhance LLMs with semantic search capabilities for incorporating external context using RAGs (Retrieval Augmented Generation) while discussing whether RAGs will be really needed in the future given the rise of long-context language models.\n", "\n", - "In our last case study, we build a quiz generator using a LLM with large context window. We will explore some additional relevant techniques such as prompt caching and response verification through citations.\n", + "While RAGs are useful for incorporating external context, they are not a silver bullet nor a mandatory component for all LLM applications. In our last case study, we leverage long-context windows to build a quiz generator from a large knowledge base. We will also explore some additional relevant techniques such as prompt caching and response verification through citations.\n", "\n", "By the chapter's conclusion, readers will possess relevant knowledge of input data management strategies for LLMs and practical expertise in selecting and implementing appropriate approaches and tools for specific use cases." ] @@ -50,9 +47,11 @@ "source": [ "## Parsing Documents\n", "\n", - "Building robust data ingestion and preprocessing pipelines is essential for any LLM application. This section explores tools and frameworks that streamline input data processing, in particular for parsing purposes, providing a unified interface for converting diverse data formats into standardized representations that LLMs can effectively process. By abstracting away format-specific complexities, they allow developers to focus on core application logic rather than parsing implementation details while maximizing the performance of the LLM.\n", + "Data parsing and formatting play a critical role in LLMs performance {cite}`he2024doespromptformattingimpact, liu2024enhancingllmscognitionstructurization, tan2024htmlraghtmlbetterplain`. Hence, building robust data ingestion and preprocessing pipelines is essential for any LLM application. \n", + "\n", + "This section explores open source tools that streamline input data processing, in particular for parsing purposes, providing a unified interface for converting diverse data formats into standardized representations that LLMs can effectively process. By abstracting away format-specific complexities, they allow developers to focus on core application logic rather than parsing implementation details while maximizing the LLM performance.\n", "\n", - "We will cover open source tools and frameworks that provide parsing capabilities for a wide range of data formats. And we will demonstrate how some of these tools can be used to extract structured information from complex PDFs also discussing how the quality of the parser can impact LLM's performance." + "We will cover open source tools that provide parsing capabilities for a wide range of data formats. And we will demonstrate how some of these tools can be used to extract structured information from complex PDFs demonstrating how the quality of the parser can impact LLM's performance." ] }, { @@ -61,7 +60,7 @@ "source": [ "### MarkItDown\n", "\n", - "MarkItDown is a Python package and CLI too developed by the Microsoft AutoGen team for converting various file formats to Markdown. It supports a wide range of formats including PDF, PowerPoint, Word, Excel, images (with OCR and EXIF metadata), audio (with transcription), HTML, and other text-based formats making it a useful tool for document indexing and LLM-based applications.\n", + "MarkItDown {cite}`microsoft2024markitdown` is a Python package and CLI tool developed by the Microsoft AutoGen team for converting various file formats to Markdown. It supports a wide range of formats including PDF, PowerPoint, Word, Excel, images (with OCR and EXIF metadata), audio (with transcription), HTML, and other text-based formats making it a useful tool for document indexing and LLM-based applications.\n", "\n", "Key features:\n", "- Simple command-line and Python API interfaces\n", @@ -81,7 +80,7 @@ "\n", "### Docling\n", "\n", - "Docling is a Python package developed by IBM Research for parsing and converting documents into various formats. It provides advanced document understanding capabilities with a focus on maintaining document structure and formatting.\n", + "Docling {cite}`docling2024github` is a Python package developed by IBM Research for parsing and converting documents into various formats. It provides advanced document understanding capabilities with a focus on maintaining document structure and formatting.\n", "\n", "Key features:\n", "- Support for multiple document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, etc.)\n", @@ -101,13 +100,6 @@ "```" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Frameworks-Based Parsing\n" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -119,17 +111,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "A common use case where document parsing matters is to structured data extraction from documents, particularly in the presence of complex formatting and layout. In this case study, we will extract the economic forecasts from Merrill Lynch's CIO Capital Market Outlook released on December 16, 2024 {cite:p}`merrill2024`. We will focus on page 7 of this document, which contains several economic variables organized in a mix of tables, text and images (see {numref}`forecast`)\n", + "A common use case where document parsing matters is structured data extraction, particularly in the presence of complex formatting and layout. In this case study, we will extract the economic forecasts from Merrill Lynch's CIO Capital Market Outlook released on December 16, 2024 {cite}`merrill2024`. We will focus on page 7 of this document, which contains several economic variables organized in a mix of tables, text and images (see {numref}`forecast`).\n", "\n", "\n", "```{figure} ../data/input/forecast.png\n", "---\n", "name: forecast\n", "alt: Forecast\n", - "scale: 50%\n", + "scale: 45%\n", "align: center\n", "---\n", - "Forecast\n", + "Merrill Lynch's CIO Capital Market Outlook released on December 16, 2024 {cite}`merrill2024`\n", "```" ] }, @@ -184,7 +176,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "How similar are the two results? We can use use Levenshtein distance to measure the similarity between the two results. We will also calculate a naive score using the `SequenceMatcher` from the `difflib` package, which is a simple measure of the similarity between two strings based on the number of matches in the longest common subsequence." + "How similar are the two results? We can use use Levenshtein distance to measure the similarity between the two results. We will also calculate a naive score using the `SequenceMatcher` from the `difflib` package, which is a simple measure of similarity between two strings based on the number of matches in the longest common subsequence." ] }, { @@ -256,7 +248,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It turns out that the two results are quite different, with a similarity score of about 13.98% and 17.77% for Levenshtein and `SequenceMatcher` respectively." + "It turns out that the two results are quite different, with a similarity score of about 13.98% and 17.77% for Levenshtein and `SequenceMatcher`, respectively." ] }, { @@ -351,7 +343,7 @@ "scale: 45%\n", "align: center\n", "---\n", - "Forecast 2025\n", + "Merrill Lynch's CIO Economic Forecasts.\n", "```\n", "\n", "We will define a `Forecast` pydantic model to represent an economic forecast composed of a `financial_variable` and a `financial_forecast`. We will also define a `EconForecast` pydantic model to represent the list of economic forecasts we want to extract from the document.\n" @@ -375,7 +367,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We write a simple function to extract the economic forecasts from the document using an LLM model (with structured output) with the following prompt template, where `extract_prompt` is kind of data the user would like to extract and `doc` is the input document to analyze." + "We write a simple function to extract the economic forecasts from the document using an LLM model (with structured output) with the following prompt template, where `extract_prompt` represents the kind of data the user would like to extract and `doc` is the input document to analyze." ] }, { @@ -682,7 +674,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, let's focus on the asset class weightings. We will extract the asset class weightings from the document and compare the results from MarkItDown and Docling. The information now is presented in a quite different structure. The CIO view information is represented in a spectrum from starting with \"Underweight\", passing through \"Neutral\" and reaching \"Overweight\". The actual view is marked by some colored dots in the chart. Let's see if we can extract this information from the document.\n", + "Now, let's focus on the asset class weightings. We will extract the asset class weightings from the document and compare the results from MarkItDown and Docling. The information now is presented in a quite different structure as we can see in {ref}`asset_class`. The CIO view information is represented in a spectrum starting with \"Underweight\", passing through \"Neutral\" and reaching \"Overweight\". The actual view is marked by some colored dots in the chart. Let's see if we can extract this relatively more complex information from the document.\n", "```{figure} ../_static/input/asset_class.png\n", "---\n", "name: asset_class\n", @@ -729,7 +721,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we construct a DataFrame to compare the results from MarkItDown and Docling with an added \"true_value\" column containing the true values from the document, which we extracted manually from the chart." + "We construct a DataFrame to compare the results from MarkItDown and Docling with an added \"true_value\" column containing the true values from the document, which we extracted manually from the chart. This enables us to calculate accuracy of the structured data extraction task in case." ] }, { @@ -936,7 +928,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Docling performs significantly better at 93.33% accuracy missing only one value. MarkItDown achieves 53.33% accuracy, struggling with nuanced asset class weightings. In this case, Docling's structured parsed output did help the LLM to extract the information more accurately compared to MarkItDown's unstructured output. Hence, in this case, the strategy used to parse the data did impact the LLM's ability to extract the information. A more robust analysis would run data extraction on a large sample data a number of repeated runs to estimate error rates." + "We observe that Docling performs significantly better at 93.33% accuracy missing only one value. MarkItDown achieves 53.33% accuracy struggling with nuanced asset class weightings. In this case, Docling's structured parsed output did help the LLM to extract the information more accurately compared to MarkItDown's unstructured output. Hence, in this case, the strategy used to parse the data did impact the LLM's ability to extract structured information. Having said that, it is important to mention that a more robust analysis would run data extraction on a large sample data a number of repeated runs to estimate error rates since results are non-deterministic." ] }, { @@ -945,8 +937,8 @@ "source": [ "What if we want to systematically extract all tables from the document? We can use Docling to do that by simply accessing the `tables` attribute of the `DocumentConverter` object.\n", "\n", - "By doing that, we observe that Docling extracted 7 tables from the document. Exporting tables from top down and left to right in order of appearance in the document.\n", - "Below, we can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts as well as the last table, which contains CIO Equity Sector Views.\n" + "By doing that, we observe that Docling extracted 7 tables from the document exporting tables from top down and left to right in order of appearance in the document.\n", + "Below, we display the first two and the last tables. We can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts as well as the last table, which contains CIO Equity Sector Views.\n" ] }, { @@ -1593,7 +1585,14 @@ "- The description mentions \"overweight positions in certain sectors such as Utilities and Financials\" but looking at the CIO Equity Sector Views, both these sectors show neutral positions, not overweight positions.\n", "- For fixed income, the description cites a \"10-Year (4.03%)\" yield, but the image shows the 30-Year Yield at 4.03%, while the 10-Year Yield is actually 4.40%.\n", "\n", - "Arguably, the description's inaccuracies could be a consequence of the underlying LLM model's inability to process the image. Further research is needed to determine if this is the case." + "Arguably, the description's inaccuracies could be a consequence of the underlying LLM model's inability to process the image.\n", + "\n", + "We have covered MarkitDown and Docling as examples of open source tools that can help developers parse input data into a suitable format to LLMs. Other relevant open source tools worth mentioning include:\n", + "- Unstructured.io {cite}`unstructured2024github`: A Python library for unstructured data extraction.\n", + "- FireCrawl {cite}`mendable2024firecrawl`: A Fast and Efficient Web Crawler for LLM Training Data.\n", + "- LlamaParse {cite}`llamaparse2024github`: Llamaindex's data parsing solution.\n", + "\n", + "The choice of tool depends on the specific requirements of the application and the nature of the input data. This choice should be taken as a critical decision of any data intensive LLM-based application and deserves dedicated research and evidence-based experimentation.\n" ] }, { @@ -1602,75 +1601,152 @@ "source": [ "## Retrieval-Augmented Generation\n", "\n", - "RAG is a technique that allows LLMs to retrieve information from a knowledge base to answer questions. It is a popular technique for building LLM applications that require knowledge-intensive tasks {cite}`lewis2021retrievalaugmentedgenerationknowledgeintensivenlp`.\n", + "What happens if we asked ChatGPT who's the author of the book \"Taming LLMs\"?\n", "\n", - "RAG utilizes a retrieval system to fetch external knowledge and augment the LLM. It has proved effective in mitigating hallucinations of LLMs {cite}`10.1145/3589334.3645481, ni-etal-2024-llms`." + "\n" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 1, "metadata": {}, + "outputs": [], "source": [ - "## Case Studies\n", - "\n", - "This section presents three case studies that demonstrate practical solutions to common LLM limitations:\n", - "\n", - "First, Content Chunking with Contextual Linking showcases how intelligent chunking strategies can overcome both context window and output token limitations. This case study illustrates techniques for breaking down and reassembling content while maintaining coherence, enabling the generation of high-quality long-form outputs despite model constraints.\n", + "from dotenv import load_dotenv\n", + "import os\n", "\n", - "Second, a Retrieval Augmented Generation case study addresses the challenge of stale or outdated model knowledge. By implementing semantic search over a GitHub repository, this example demonstrates how to augment LLM responses with current, accurate information - allowing users to query and receive up-to-date answers about code repository contents.\n", + "# Load environment variables from .env file\n", + "load_dotenv()\n", "\n", - "Third, the final case study builds a Quiz generator with citations. This case study explores some additional input management techniques that become particularly useful when long context window is available. This includes implementing prompt caching for efficiency and adding citations to enhance response accuracy and verifiability. These approaches show how to maximize the benefits of larger context models while maintaining response quality." + "from openai import OpenAI\n", + "client = OpenAI()\n", + "model = \"gpt-4o-mini\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "question = \"Who's the Author of the Book Taming LLMs?\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The book \"Taming LLMs\" is authored by *G. Arulkumaran, H. M. B. P. D. Karthikeyan, and I. A. M. Almasri.* If you need more information about the book or its contents, feel free to ask!\n" + ] + } + ], + "source": [ + "response = client.chat.completions.parse(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": question}\n", + " ]\n", + ")\n", + "response.choices[0].message.content" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Case Study I: Content Chunking with Contextual Linking\n", + "Turns out ChatGPT hallucinates. A quick web search on the before mentioned authors yields no results. In fact, those authors names are made up. And of course the correct answer would have been \"Tharsis Souza\".\n", "\n", - "Content chunking with contextual linking is a technique to break down long-form content into smaller, manageable chunks while keeping chunk-specific context. This approach tackles three problems:\n", - "1. The LLM's inability to process long inputs to do context-size limits\n", - "2. The LLM's inability to generate long-form content due to the `max_output_tokens` limitation.\n", - "3. The LLM's inability to maintain coherence and context when generating responses per chunks\n", + "LLMs only have access to the information they have been trained on, which of course has been fixed at a point in time. Hence, LLMs operate with stale data. The problem gets exacerbated by the fact that LLMs are trained to provide an answer even if the answer is unknown by them, hence leading to hallucinations. \n", "\n", - "Here, we exemplify this technique by following these steps:\n", - "1. **Chunking the Content**: The input content is split into smaller chunks. This allows the LLM to process each chunk individually, focusing on generating a complete and detailed response for that specific section of the input.\n", + "One solution to this problem is to use a retrieval system to fetch information from a knowledge base to provide recent and relevant context to user queries using so-called Retrieval Augmented Generation (RAG) system.\n", "\n", - "2. **Maintaining Context**: Each chunk is linked with contextual information from the previous chunks. This helps in maintaining the flow and coherence of the content across multiple chunks.\n", + "RAG utilizes a retrieval system to fetch external knowledge and augment LLM's context. It is a useful technique for building LLM applications that require domain-specific information or knowledge-intensive tasks {cite}`lewis2021retrievalaugmentedgenerationknowledgeintensivenlp`. It has also proved effective in mitigating LLMs hallucinations {cite}`10.1145/3589334.3645481, ni-etal-2024-llms`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the above example, a RAG would help with hallucinations by grounding the LLM's response to information provided in the knowledge base. Additional common use cases of RAG systems include:\n", "\n", - "3. **Generating Linked Prompts**: For each chunk, a prompt is generated that includes the chunk's content and its context. This prompt is then used to generate the output for that chunk.\n", + "1. **Enterprise Knowledge Management**: RAG enables organizations to synthesize answers from diverse internal data sources like documents, databases, and communication channels. This creates a unified knowledge interface that can accurately answer questions using the organization's own data.\n", + "2. **Document Processing and Analysis**: RAG excels at extracting and analyzing information from complex documents like financial reports, presentations, and spreadsheets. The system can enable LLMs to understand context and relationships across different document types and formats.\n", + "3. **Intelligent Customer Support**: By combining knowledge bases with conversational abilities, RAG powers chatbots and support systems that can maintain context across chat history, provide accurate responses, and handle complex customer queries while reducing hallucinations.\n", + "4. **Domain-Specific Applications**: RAG allows LLMs to be equipped with specialized knowledge in fields like medicine, law, or engineering by retrieving information from domain-specific literature, regulations, and technical documentation. This enables accurate responses aligned with professional standards and current best practices.\n", + "5. **Code Documentation and Technical Support**: RAG can help developers by retrieving relevant code examples, API documentation, and best practices from repositories and documentation, which often suffer updates frequently, enabling more accurate and contextual coding assistance.\n", "\n", - "4. **Combining the Outputs**: The outputs of all chunks are combined to form the final long-form content.\n", + "If LLMs alone work on stale, general-purpose data with the added challenge of being prone to hallucinations, RAG systems serve as an added capability enabling LLMs to work on recent, domain-specific knowledge increasing the likelihood of LLMs to provide responses that are factual and relevant to user queries.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### RAG Pipeline\n", "\n", - "Let's examine an example implementation of this technique.\n", + "RAG architectures vary but they all share the same goal: to retrieve relevant information from a knowledge base to maximize the LLM's ability to effectively and accurately respond to prompts, particularly when the answer requires out-of-training data information.\n", "\n", - "#### Generating long-form content\n", + "We will introduce key components of a RAG system one by one leading to a full canonical RAG pipeline at the end that ultimately will be used to answer our original question \"Who's the author of the book Taming LLMs?\", accurately.\n", "\n", - "- Goal: Generate a long-form report analyzing a company's financial statement.\n", - "- Input: A company's 10K SEC filing.\n", + "The following basic components will be introduced (see {numref}`rag_pipeline` for a visual representation):\n", + "- Vector Database\n", + " - Embeddings\n", + " - Indexing\n", + "- Retrieval System including re-ranking\n", + "- LLM Augmented Generation via in-context learning\n", "\n", - "```{figure} ../_static/structured_output/diagram1.png\n", + "Data extraction, parsing and chunking are also part of a canonical pipeline as we prepare the knowledge base. Those are concepts that we have already explored in the previous sections, hence we will be succinct here. We will start by preparing the knowledge base.\n", + "\n", + "```{figure} ../_static/input/rag.svg\n", "---\n", - "name: content-chunking-with-contextual-linking\n", - "alt: Content Chunking with Contextual Linking\n", - "scale: 50%\n", + "name: rag_pipeline\n", + "alt: RAG Pipeline\n", + "scale: 99%\n", "align: center\n", "---\n", - "Content Chunking with Contextual Linking Schematic Representation.\n", - "```\n", - "\n", - "The diagram in {numref}`content-chunking-with-contextual-linking` illustrates the process we will follow for handling long-form content generation with Large Language Models through \"Content Chunking with Contextual Linking.\" It shows how input content is first split into manageable chunks using a chunking function (e.g. `CharacterTextSplitter` with `tiktoken` tokenizer), then each chunk is processed sequentially while maintaining context from previous chunks. For each chunk, the system updates the context, generates a dynamic prompt with specific parameters, makes a call to the LLM chain, and stores the response. After all chunks are processed, the individual responses are combined with newlines to create the final report, effectively working around the token limit constraints of LLMs while maintaining coherence across the generated content.\n", + "Simplified RAG Pipeline\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Preparing the Knowledge Base\n", "\n", - "**Step 1: Chunking the Content**\n", + "Every RAG system requires a knowledge base. In our case, the knowledge base is a set of documents that we equip the LLM to answer our authorship question.\n", "\n", - "There are different methods for chunking, and each of them might be appropriate for different situations. However, we can broadly group chunking strategies in two types:\n", - "- **Fixed-size Chunking**: This is the most common and straightforward approach to chunking. We simply decide the number of tokens in our chunk and, optionally, whether there should be any overlap between them. In general, we will want to keep some overlap between chunks to make sure that the semantic context doesn’t get lost between chunks. Fixed-sized chunking may be a reasonable path in many common cases. Compared to other forms of chunking, fixed-sized chunking is computationally cheap and simple to use since it doesn’t require the use of any specialied techniques or libraries.\n", - "- **Content-aware Chunking**: These are a set of methods for taking advantage of the nature of the content we’re chunking and applying more sophisticated chunking to it. Examples include:\n", - " - **Sentence Splitting**: Many models are optimized for embedding sentence-level content. Naturally, we would use sentence chunking, and there are several approaches and tools available to do this, including naive splitting (e.g. splitting on periods), NLTK, and spaCy.\n", - " - **Recursive Chunking**: Recursive chunking divides the input text into smaller chunks in a hierarchical and iterative manner using a set of separators.\n", - " - **Semantic Chunking**: This is a class of methods that leverages embeddings to extract the semantic meaning present in your data, creating chunks that are made up of sentences that talk about the same theme or topic.\n", + "Hence, we will compose our knowledge base by adding the web version of (some of the chapters of) the book \"Taming LLMs\", namely:\n", + "- Introduction\n", + "- Structured Output\n", + "- Input (this very chapter)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "book_url = \"https://www.tamingllms.com/\"\n", + "chapters = [\"markdown/intro.html\",\n", + " \"notebooks/structured_output.html\",\n", + " \"notebooks/input.html\"]\n", "\n", - " Here, we will utilize `langchain` for a content-aware sentence-splitting strategy for chunking. Langchain offers several text splitters {cite}`langchain_text_splitters` such as JSON-, Markdown- and HTML-based or split by token. We will use the `CharacterTextSplitter` with `tiktoken` as our tokenizer to count the number of tokens per chunk which we can use to ensure that we do not surpass the input token limit of our model.\n" + "chapter_urls = [f\"{book_url}/{chapter}\" for chapter in chapters]\n", + "chapter_ids = [chapter.split(\"/\")[-1].replace(\".html\", \"\") for chapter in chapters]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use `Docling` to download the chapters from the web and parse them as markdown files." ] }, { @@ -1679,36 +1755,57 @@ "metadata": {}, "outputs": [], "source": [ - "def get_chunks(text: str, chunk_size: int, chunk_overlap: int) -> list:\n", - " \"\"\"\n", - " Split input text into chunks of specified size with specified overlap.\n", + "chapters = [converter.convert(chapter_url).document.export_to_markdown() for chapter_url in chapter_urls]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we are ready to store the chapters in a vector database to enable the construction of a retrieval system." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Vector Database\n", "\n", - " Args:\n", - " text (str): The input text to be chunked.\n", - " chunk_size (int): The maximum size of each chunk in tokens.\n", - " chunk_overlap (int): The number of tokens to overlap between chunks.\n", + "Vector databases are specialized databases designed to store and retrieve high-dimensional vectors, which are mathematical representations of data like text, images, or audio. These databases are optimized for similarity search operations, making them ideal for embeddings-based retrieval systems.\n", "\n", - " Returns:\n", - " list: A list of text chunks.\n", - " \"\"\"\n", - " from langchain_text_splitters import CharacterTextSplitter\n", + "A typical pipeline involving a vector database includes the following:\n", "\n", - " text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", - " return text_splitter.split_text(text)\n" + "1. Input data is converted into \"documents\" forming a collection representing our knowledge base\n", + "2. Each document is converted into an embedding which are stored in the vector database\n", + "3. Embeddings are indexed in the vector database for efficient similarity search\n", + "4. The vector database is queried to retrieve the most relevant documents\n", + "5. The retrieved documents are used to answer questions\n", + "\n", + "Vector databases are not a mandatory component of RAG systems. In fact, we can use a simple list of strings to store the chapters (or their chunks) and then use the LLM to answer questions about the document. However, vector databases are useful for RAG applications as they enable:\n", + "- Fast similarity search for finding relevant context\n", + "- Efficient storage of document embeddings\n", + "- Scalable retrieval for large document collections\n", + "- Flexible querying with metadata filters\n", + "\n", + "In that way, RAG applications can be seen as a retrieval system that uses a vector database to store and retrieve embeddings of documents, which in turn are used to augment LLMs with contextually relevant information as we will see in the next sections.\n", + "\n", + "Here, we will use ChromaDB {cite}`chromadb2024docs` as an example of an open source vector database but key features and concepts we cover are applicable to other vector databases, in general.\n", + "\n", + "ChromaDB is a popular open-source vector database that offers:\n", + "- Efficient storage and retrieval of embeddings\n", + "- Support for metadata and filtering\n", + "- Easy integration with Python applications\n", + "- In-memory and persistent storage options\n", + "- Support for multiple distance metrics\n", + "\n", + "Other notable vector databases include Weaviate, FAISS, and Milvus." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Step 2: Writing the Base Prompt Template**\n", - "\n", - "We will write a base prompt template which will serve as a foundational structure for all chunks, ensuring consistency in the instructions and context provided to the language model. The template includes the following parameters:\n", - "- `role`: Defines the role or persona the model should assume.\n", - "- `context`: Provides the background information or context for the task.\n", - "- `instruction`: Specifies the task or action the model needs to perform.\n", - "- `input_text`: Contains the actual text input that the model will process.\n", - "- `requirements`: Lists any specific requirements or constraints for the output." + "In ChromaDB, we can create a vector database client as follows." ] }, { @@ -1717,26 +1814,17 @@ "metadata": {}, "outputs": [], "source": [ - "from langchain_core.prompts import PromptTemplate\n", - "def get_base_prompt_template() -> str:\n", - " \n", - " base_prompt = \"\"\"\n", - " ROLE: {role}\n", - " CONTEXT: {context}\n", - " INSTRUCTION: {instruction}\n", - " INPUT: {input}\n", - " REQUIREMENTS: {requirements}\n", - " \"\"\"\n", - " \n", - " prompt = PromptTemplate.from_template(base_prompt)\n", - " return prompt" + "import chromadb\n", + "chroma_client = chromadb.Client()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We will write a simple function that returns an `LLMChain` which is a simple `langchain` construct that allows you to chain together a combination of prompt templates, language models and output parsers." + "This will create a vector database in memory. We can also create a persistent vector database by specifying a path to a directory or alternatively by using a cloud-based vector database service like AWS, Azure or GCP. We will use a vector database in memory for this example.\n", + "\n", + "Next, we create a collection to store the embeddings of the chapters. And add our chapters as documents to the collection as follows." ] }, { @@ -1745,45 +1833,19 @@ "metadata": {}, "outputs": [], "source": [ - "from langchain_core.output_parsers import StrOutputParser\n", - "from langchain_community.chat_models import ChatLiteLLM\n", + "collection = chroma_client.create_collection(name=\"taming_llms\")\n", "\n", - "def get_llm_chain(prompt_template: str, model_name: str, temperature: float = 0):\n", - " \"\"\"\n", - " Returns an LLMChain instance using langchain.\n", - "\n", - " Args:\n", - " prompt_template (str): The prompt template to use.\n", - " model_name (str): The name of the model to use.\n", - " temperature (float): The temperature setting for the model.\n", - "\n", - " Returns:\n", - " llm_chain: An instance of the LLMChain.\n", - " \"\"\"\n", - " \n", - " from dotenv import load_dotenv\n", - " import os\n", - "\n", - " # Load environment variables from .env file\n", - " load_dotenv()\n", - " \n", - " api_key_label = model_name.split(\"/\")[0].upper() + \"_API_KEY\"\n", - " llm = ChatLiteLLM(\n", - " model=model_name,\n", - " temperature=temperature,\n", - " api_key=os.environ[api_key_label],\n", - " )\n", - " llm_chain = prompt_template | llm | StrOutputParser()\n", - " return llm_chain" + "collection.add(\n", + " documents=chapters,\n", + " ids=chapter_ids\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Step 3: Constructing Dynamic Prompt Parameters**\n", - "\n", - "Now, we will write a function (`get_dynamic_prompt_template`) that constructs prompt parameters dynamically for each chunk." + "We are ready to query the collection. We write a simple function that takes the collection, input query and number of retrieved results as argument and returns the retrieved documents." ] }, { @@ -1792,59 +1854,19 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Dict\n", - "def get_dynamic_prompt_params(prompt_params: Dict, \n", - " part_idx: int, \n", - " total_parts: int,\n", - " chat_context: str,\n", - " chunk: str) -> str:\n", - " \"\"\"\n", - " Construct prompt template dynamically per chunk while maintaining the chat context of the response generation.\n", - " \n", - " Args:\n", - " prompt_params (Dict): Original prompt parameters\n", - " part_idx (int): Index of current conversation part\n", - " total_parts (int): Total number of conversation parts\n", - " chat_context (str): Chat context from previous parts\n", - " chunk (str): Current chunk of text to be processed\n", - " Returns:\n", - " str: Dynamically constructed prompt template with part-specific params\n", - " \"\"\"\n", - " dynamic_prompt_params = prompt_params.copy()\n", - " # saves the chat context from previous parts\n", - " dynamic_prompt_params[\"context\"] = chat_context\n", - " # saves the current chunk of text to be processed as input\n", - " dynamic_prompt_params[\"input\"] = chunk\n", - " \n", - " # Add part-specific instructions\n", - " if part_idx == 0: # Introduction part\n", - " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", - " You are generating the Introduction part of a long report.\n", - " Don't cover any topics yet, just define the scope of the report.\n", - " \"\"\"\n", - " elif part_idx == total_parts - 1: # Conclusion part\n", - " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", - " You are generating the last part of a long report. \n", - " For this part, first discuss the below INPUT. Second, write a \"Conclusion\" section summarizing the main points discussed given in CONTEXT.\n", - " \"\"\"\n", - " else: # Main analysis part\n", - " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", - " You are generating part {part_idx+1} of {total_parts} parts of a long report.\n", - " For this part, analyze the below INPUT.\n", - " Organize your response in a way that is easy to read and understand either by creating new or merging with previously created structured sections given in CONTEXT.\n", - " \"\"\"\n", - " \n", - " return dynamic_prompt_params" + "def query_collection(collection, query_text, n_results=3):\n", + " results = collection.query(\n", + " query_texts=[query_text],\n", + " n_results=n_results\n", + " )\n", + " return results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "**Step 4: Generating the Report**\n", - "\n", - "Finally, we will write a function that generates the actual report by calling the `LLMChain` with the dynamically updated prompt parameters for each chunk and concatenating the results at the end." + "We write a simple query, enquiring the purpose of the book." ] }, { @@ -1853,24 +1875,907 @@ "metadata": {}, "outputs": [], "source": [ - "def generate_report(input_content: str, llm_model_name: str, \n", - " role: str, requirements: str,\n", - " chunk_size: int, chunk_overlap: int) -> str:\n", - " # stores the parts of the report, each generated by an individual LLM call\n", - " report_parts = [] \n", - " # split the input content into chunks\n", - " chunks = get_chunks(input_content, chunk_size, chunk_overlap)\n", - " # initialize the chat context with the input content\n", - " chat_context = input_content\n", - " # number of parts to be generated\n", - " num_parts = len(chunks)\n", - "\n", - " prompt_params = {\n", - " \"role\": role, # user-provided\n", - " \"context\": \"\", # dinamically updated per part\n", - " \"instruction\": \"\", # dynamically updated per part\n", - " \"input\": \"\", # dynamically updated per part\n", - " \"requirements\": requirements #user-priovided\n", + "q = \"What is the purpose of this book?\"\n", + "res = query_collection(collection, q)\n", + "res.get(\"ids\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print([['intro', 'input', 'structured_output']])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As response, we obtain an object that contains several attributes including:\n", + "- `documents`: The actual documents retrieved from the collection, i.e. the chapters \n", + "- `ids`: The ids of the documents retrieved from the collection\n", + "- `distances`: The distances of the documents to the query vector\n", + "\n", + "We can see that the chapters \"Introduction\", \"Input\" and \"Structured Output\" are retrieved from the collection ordered by their distance to the query vector.\n", + "\n", + "We observe that the Introduction chapter is the most relevant one as it ranks first, followed by the Input and Structured Output chapters. Indeed, the purpose of the book is included in the Introduction chapter demonstrating the retrieval system successfully retrieved the most relevant document to the input query, in this simple example.\n", + "\n", + "In order to understand how the retrieval system works and how the \"distance to the query vector\" is computed, we need to understand how the embeddings are created and how the documents are indexed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Embeddings**\n", + "\n", + "Embeddings are numerical representations of data (including text, images, audio, etc.) that capture meaning, allowing machines to process data quantitatively. Each embedding can be represented as a vector of floating-point numbers such that embedded data with similar meanings produce similar, i.e. close, vectors [^embeddings_definition].\n", + "\n", + "[^embeddings_definition]: Bengio et al. {cite}`bengio2014representationlearningreviewnew` provide serves as an excellent reference for representation learning in general including embeddings. OpenAI provides a good intro to Embeddings for developers {cite}`openai2024embeddings`\n", + "\n", + "For text data, small distances among embeddings suggest high semantic relatedness and large distances suggest low semantic relatedness among the embedded texts. HuggingFace provides a leaderboard of embeddings models {cite}`huggingface2024mteb`, which are ranked by in dimensions such as classification, clustering and reranking performance.\n", + "\n", + "Behind the scenes, ChromaDB is using the model `all-MiniLM-L6-v2` by default [^chroma_embeddings] to create embeddings for the input documents and the query (see {numref}`embedding`). This model is available in `sentence_transformers` {cite}`sentencetransformers2024website`. Let's see how it works.\n", + "\n", + "```{figure} ../_static/input/embedding.svg\n", + "---\n", + "name: embedding\n", + "alt: Embedding\n", + "scale: 70%\n", + "align: center\n", + "---\n", + "Embedding\n", + "```\n", + "\n", + "[^chroma_embeddings]: ChromaDB enables custom embedding functions and provides a list of wrappers around commonly used embedding models and APIs https://docs.trychroma.com/docs/embeddings/embedding-functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sentence_transformers import SentenceTransformer\n", + "\n", + "embedding_model = SentenceTransformer('all-MiniLM-L6-v2')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We replicate what ChromaDB did by embedding our chapters as well as input query using sentence transformers." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(4, 384)\n" + ] + } + ], + "source": [ + "q = \"What is the purpose of this book?\"\n", + "docs_to_embed = [q] + chapters\n", + "embeddings = embedding_model.encode(docs_to_embed)\n", + "print(embeddings.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a result, we obtain four 384-dimensional vectors representing our embeddings (one for each of the three chapters and one for the input query).\n", + "\n", + "Now we can calculate similarity among the embeddings. By default, sentence transformers uses cosine similarity to calculate the similarity between embeddings. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "similarities = embedding_model.similarity(embeddings, embeddings)\n", + "similarities" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```\n", + "tensor([[1.0000, 0.4402, 0.3022, 0.4028],\n", + " [0.4402, 1.0000, 0.6606, 0.5807],\n", + " [0.3022, 0.6606, 1.0000, 0.6313],\n", + " [0.4028, 0.5807, 0.6313, 1.0000]])\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's visualize the similarity matrix to better understand the relationships between our documents in {numref}`similarities`. The top row of the matrix represents the similarity of the input query against all chapters. That's exactly what we previously obtained by querying ChromaDB which returned a response with documents ranked by similarity to input query.\n", + "\n", + "```{figure} ../_static/input/similarity.png\n", + "---\n", + "name: similarities\n", + "alt: Similarity matrix heatmap\n", + "scale: 90%\n", + "align: center\n", + "---\n", + "Similarity matrix heatmap showing relationships among query and chapters.\n", + "``` \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Calculating similarity among embeddings can become computationally intensive if brute force is used, i.e. pair-wise computation, as the number of documents grows in the knowledge base. Indexing is a technique to help address this challenge." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Indexing**\n", + "\n", + "Indexing is a crucial optimization technique that makes similarity searches faster and more efficient.\n", + "\n", + "Without indexing, finding similar vectors would require an exhaustive search - comparing a query vector against every single vector in the database. For large datasets, this becomes prohibitively slow.\n", + "\n", + "Common indexing strategies include:\n", + "\n", + "1. **Tree-based Indexes**\n", + " - Examples include KD-trees and Ball trees\n", + " - Work by partitioning the vector space into hierarchical regions\n", + " - Effective for low-dimensional data but suffer from the \"curse of dimensionality\"\n", + "\n", + "2. **Graph-based Indexes**\n", + " - HNSW (Hierarchical Navigable Small World) is a prominent example\n", + " - Creates a multi-layered graph structure for navigation\n", + " - Offers excellent search speed but requires more memory\n", + "\n", + "3. **LSH (Locality-Sensitive Hashing)**\n", + " - Uses hash functions that map similar vectors to the same buckets\n", + " - More memory-efficient than graph-based methods\n", + " - May sacrifice some accuracy for performance\n", + "\n", + "4. **Quantization-based Indexes**\n", + " - Product Quantization compresses vectors by encoding them into discrete values\n", + " - Reduces memory footprint significantly\n", + " - Good balance between accuracy and resource usage\n", + "\n", + "HNSW is the underlying library for Chroma vector indexing and search {cite}`chromadb2024hnsw`. HNSW provides fast searches with high accuracy but uses more memory. LSH and quantization methods offer better memory efficiency but may sacrifice some precision.\n", + "\n", + "But are indexing + basic embeddings based similarity sufficient? Often not, as we will see next as we cover reranking technique." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reranking\n", + "\n", + "Let's go back to querying our vector database. Here are additional examples." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we write a query about how to get structured output from LLMs. Successfully retrieving the \"Structured Output\" chapter from the book as top result." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[['structured_output', 'input', 'intro']]\n" + ] + } + ], + "source": [ + "q = \"How to get structured output from LLMs?\"\n", + "res = query_collection(collection, q)\n", + "res.get(\"ids\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we would like to obtain a tutorial on `Docling`, a tool we covered in this very chapter. However, we fail to obtain the correct chapter and instead obtain the \"Introduction\" chapter as a result." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[['intro', 'input', 'structured_output']]\n" + ] + } + ], + "source": [ + "q = \"Docling tutorial\"\n", + "res = query_collection(collection, q)\n", + "res.get(\"ids\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Retrieval systems solely based on vector similarity search might miss semantic relevance. That brings the need for techniques that can improve accuracy of the retrieval system. One such technique is re-ranking.\n", + "\n", + "Re-ranking is a method that can improve accuracy of the retrieval system by re-ranking the retrieved documents based on their relevance to the input query.\n", + "\n", + "In the following, we will use the `sentence_transformers` library to re-rank the retrieved documents based on their relevance to the input query. We utilize the `CrossEncoder` model to re-rank the documents. Cross-Encoder models are more accurate at judging relevance at the cost of speed compared to basic vector-based similarity. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can implement a reranking step in a RAG system using a Cross-Encoder model in the following steps:\n", + "\n", + "1. First, we initialize the Cross-Encoder model:\n", + "```python\n", + "model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)\n", + "```\n", + "- Uses the `ms-marco-MiniLM-L-6-v2` model, which is specifically trained for passage reranking\n", + "- Sets a maximum sequence length of 512 tokens\n", + "- This model is designed to score the relevance between query-document pairs\n", + "\n", + "2. Then we perform the reranking:\n", + "```python\n", + "scores = model.predict([(q, doc) for doc in res[\"documents\"][0]])\n", + "```\n", + "- Creates pairs of (query, document) for each retrieved document\n", + "- The model predicts relevance scores for each pair\n", + "- Higher scores indicate better semantic match between query and document\n", + "\n", + "3. Finally, we select the best match:\n", + "```python\n", + "print(res[\"documents\"][0][np.argmax(scores)])\n", + "```\n", + "- `np.argmax(scores)` finds the index of the highest scoring document\n", + "- Uses that index to retrieve the most relevant document\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We obtain the following scores for the retrieved documents (\"intro\", \"input\", \"structured_output\"), the higher the score, the more relevant the document is in relation to the input query.\n", + "\n", + "```\n", + "array([-8.52623 , -6.328738, -8.750055], dtype=float32)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a result, we obtain the index of the highest scoring document, which corresponds to the \"input\" chapter. Hence, the re-ranking step successfully retrieved the correct chapter." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "input\n" + ] + } + ], + "source": [ + "print(res[\"ids\"][0][np.argmax(scores)])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The ideia is to first run semantic similarity on embeddings, which should be fast but potentially inaccurate, and then run re-raking on the top-k results, which is more accurate but slower. By doing so, we can balance the speed and accuracy of the retrieval system.\n", + "\n", + "Hence, instead of going over all retrieved documents:\n", + "```python\n", + "scores = model.predict([(q, doc) for doc in res[\"documents\"][0]])\n", + "```\n", + "We would run reranking on the TOPK results, where TOPK <<< number of documents:\n", + "```python\n", + "scores = model.predict([(q, doc) for doc in res[\"documents\"][0][:TOPK]])\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### LLMs with RAG\n", + "\n", + "We are finally ready to use the retrieval system to help the LLM answer our authorship question. A common way to integrate RAGs with LLMs is via in-context learning. With in-context learning the LLM learns from the retrieved documents by providing them in the context window as represented in {numref}`incontext`. This is accomplished via a prompt template structure as follows.\n", + "\n", + "```{figure} ../_static/input/incontext.svg\n", + "---\n", + "name: incontext\n", + "alt: In-Context Learning\n", + "scale: 95%\n", + "align: center\n", + "---\n", + "RAG LLM with In-Context Learning\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + " rag_system_prompt_template = f\"\"\"\n", + " You are a helpful assistant that answers questions based on the provided CONTEXT.\n", + "\n", + " CONTEXT: {context}\n", + " \"\"\"\n", + "\n", + " user_prompt_template = f\"\"\"\n", + " QUESTION: {input}\n", + " \"\"\"\n", + "```\n", + "\n", + "This prompt strategy demonstrates a common in-context learning pattern where retrieved documents are incorporated into the LLM's context to enhance response accuracy and relevance. The prompt structure typically consists of a system prompt that:\n", + "- Sets clear boundaries for the LLM to use information from the provided context\n", + "- Includes the retrieved documents as context\n", + "\n", + "This approach:\n", + "- Reduces hallucination by grounding responses in source documents\n", + "- Improves answer relevance by providing contextually relevant information to the LLM\n", + "\n", + "The context variable is typically populated with the highest-scoring document(s) from the retrieval step, while the input variable contains the user's original query." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def RAG_qa(client, model, context, input):\n", + " \"\"\"\n", + " Generate a summary of input using a given model\n", + " \"\"\"\n", + " rag_system_prompt_template = f\"\"\"You are a helpful assistant that answers questions based on the provided CONTEXT.\n", + "\n", + " CONTEXT: {context}\n", + " \"\"\"\n", + " \n", + " response = client.chat.completions.create(\n", + " model=model,\n", + " messages=[{\"role\": \"system\", \"content\": rag_system_prompt_template},\n", + " {\"role\": \"user\", \"content\": f\"QUESTION: {input}\"}]\n", + " )\n", + " return response.choices[0].message.content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we set the LLM." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dotenv import load_dotenv\n", + "import os\n", + "\n", + "# Load environment variables from .env file\n", + "load_dotenv()\n", + "\n", + "from openai import OpenAI\n", + "client = OpenAI()\n", + "model = \"gpt-4o-mini\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, we run the retrieve step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "res = query_collection(collection, q)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we run the re-ranking step setting it to consider the `TOPK` retrieved documents." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TOPK = 2\n", + "scores = model.predict([(q, doc) for doc in res[\"documents\"][0][:TOPK]])\n", + "res_reranked = res[\"documents\"][0][np.argmax(scores)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then pass the top document as context and invoke the LLM with our RAG-based template leading to a successful response." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The author of the book \"Taming LLMs\" is Tharsis Souza.\n" + ] + } + ], + "source": [ + "answer = RAG_qa(model, res_reranked[0], question)\n", + "answer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this section, we motivated the use of RAGs as a tool to equip LLMs with relevant context and provided a canonical implementation of its core components. RAGs, however, can be implemented in many shapes and forms and entire books have been written about them. We point the user to additional resources if more specialized techniques and architectures are needed {cite}`kimothi2024simpleguiderag, athinaai2024ragcookbooks, diamant2024ragtechniques, hands-on-llms-book`.\n", + "\n", + "Next, we discuss RAGs challenges and limitations and conclude our RAGs section envisioning the future of RAGs challenged by the rise of long-context language models.\n", + "\n", + "### Challenges and Limitations\n", + "\n", + "While RAG systems offer powerful capabilities for enhancing LLM responses with external knowledge, they face several significant challenges and limitations that require careful consideration:\n", + " \n", + "- **Data Quality and Accuracy**: The effectiveness of RAG systems fundamentally depends on the quality and reliability of their knowledge sources. When these sources contain inaccurate, outdated, biased, or incomplete information, the system's responses become unreliable. This challenge is particularly acute when dealing with rapidly evolving topics or when sourcing information from unverified channels.\n", + " \n", + "- **Computational Cost and Latency**: Implementing RAG systems at scale presents computational and operational challenges. The process of embedding documents, maintaining vector databases, and performing similarity searches across large knowledge bases demands computational, budget and operational resources. In real-time applications, these requirements can introduce noticeable latency, potentially degrading the user experience and limiting practical applications.\n", + " \n", + "- **Explainability and Evaluation**: The complexity of RAG systems, arising from the intricate interaction between retrieval mechanisms and generative models, makes it difficult to trace and explain their reasoning processes. Traditional evaluation metrics often fail to capture the nuanced aspects of RAG performance, such as contextual relevance and factual consistency. This limitation hampers both system improvement and stakeholder trust. Readers are encouraged to read Chapter {ref}`evals` for general LLM evaluation issues as well as consider tools such as Ragas {cite}`ragas2024evaluation` for RAG evaluation.\n", + " \n", + "- **Hallucination Management**: Though RAG systems help ground LLM responses in source documents, they do not completely eliminate hallucinations. The generative component may still produce content that extrapolates beyond or misinterprets the retrieved context. This risk becomes particularly concerning when the system confidently presents incorrect information with apparent source attribution.\n", + "\n", + "\n", + "Moreover, recent research has shed light on critical limitations of key techniques used in RAGs systems. A relevant finding pertains to reranking, which has shown {cite}`jacob2024drowningdocumentsconsequencesscaling`:\n", + "\n", + "- **Diminishing Returns**: Performance degrades as the number of documents (K) increases, sometimes performing worse than basic retrievers when dealing with large datasets.\n", + "- **Poor Document Discrimination**: Rerankers can be misled by irrelevant documents, sometimes assigning high scores to content with minimal relevance to the query.\n", + "- **Consistency Issues**: Performance and relative rankings between different rerankers can vary significantly depending on the number of documents being processed.\n", + "\n", + "### Will RAGs exist in the future?\n", + "\n", + "This question is posed as we contrast RAGs with LLMs with long-context windows (LC).\n", + "\n", + "Recent research has shed light on this specific point {cite}`li2024retrievalaugmentedgenerationlongcontext`, suggesting that, on the one hand, RAGs can be seen as a cost-effective alternative to LC models:\n", + "* RAGs offer lower computational cost compared to LC due to the significantly shorter input length required for processing.\n", + "* This cost-efficiency arises because RAG reduces the number of input tokens to LLMs, which of course reduces usage cost as pricing is based on the number of input (and output) tokens.\n", + "\n", + "On the other hand, this RAG benefit is achieved at the cost of performance:\n", + "* Recent advancements in LLMs, in particular with Gemini-1.5 and GPT-4o models, demonstrate capabilities in understanding long contexts directly, which enables them to outperform RAG in terms of average performance\n", + "* LC models can process extremely long contexts, such as Gemini 1.5 which can handle up to 1 million tokens, and these models benefit from large-scale pretraining to develop strong long-context capabilities.\n", + "\n", + "This cost-performance trade-off is illustrated in {numref}`LC`, where LC models outperform RAGs in terms of average performance while RAGs are more cost-effective.\n", + "\n", + "```{figure} ../_static/input/LC.png\n", + "---\n", + "name: LC\n", + "alt: Long-Context LLMs for Superior Performance\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Long-Context LLMs demonstrate superior performance while RAGs are more cost-effective {cite}`li2024retrievalaugmentedgenerationlongcontext`.\n", + "```\n", + "\n", + "{numref}`LC` also shows a model called \"SELF-ROUTE\" which combines RAG and LC by routing queries based on model self-reflection. This is a hybrid approach that reduces computational costs while maintaining performance comparable to LC. The advantage of SELF-ROUTE is most significant for smaller values of *k*, where *k* is the number of retrieved text chunks, and SELF-ROUTE shows a marked improvement in performance over RAG, while as k increases the performance of RAG and SELF-ROUTE approaches that of LC.\n", + "\n", + "Another example of a hybrid approach that combines the benefits of both LC and RAGs is RetroLLM {cite}`li2024retrollmempoweringlargelanguage`, which is a unified framework that integrates retrieval and generation into a single process, enabling language models to generate fine-grained evidence directly from a corpus. The key contribution is that this approach delivers those benefits while eliminating the need for a separate retriever, addressing limitations of traditional RAG methods. Experimental results demonstrate RetroLLM's superior performance compared to traditional RAG methods, across both in-domain and out-of-domain tasks. It also achieves a significant reduction in token consumption due to its fine-grained evidence retrieval.\n", + "\n", + "A relevant development in this area is the introduction of LOFT {cite}`lee2024longcontextlanguagemodelssubsume`, a benchmark to assess this paradigm shift from RAGs to LCs, using real-world tasks requiring context up to millions of tokens. Evidence suggests LCs can deliver performance with simplified pipelines compared to RAGs, particularly for tasking requiring multi-hop reasoning over long contexts when using Chain-of-Thought {cite}`wei2023chainofthoughtpromptingelicitsreasoning`. However, LCs can still be outperformed by specialized retrievers, in particular Gecko, a specialized model fine-tuned on extensive text retrieval and similarity tasks.\n", + "\n", + "Bottom-line: Do we really need RAGs? The answer is conditional:\n", + "\n", + "* **RAG may be relevant when cost-effectiveness is a key requirement** and where the model needs to access vast amounts of external knowledge without incurring high computational expenses. However, as LLMs context window sizes increase and LLMs cost per input token is decreases, RAG may not be as relevant as it was before.\n", + "* **Long-context LLMs are superior when performance is the primary concern**, and the model needs to handle extensive texts that require deep contextual understanding and reasoning.\n", + "* **Hybrid approaches like SELF-ROUTE are valuable as they combine the strengths of RAG and LC** offering a practical balance between cost and performance, especially for applications where both factors are critical.\n", + "\n", + "Ultimately, the choice between RAG, LC, or a hybrid method depends on the specific requirements of the task, available resources, and the acceptable trade-off between cost and performance.\n", + "\n", + "In a later case study, we demonstrate the power of LCs as we construct a Quiz generator with citations over a large knowledge base without the use of chunking nor RAGs.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A Note on Frameworks\n", + "\n", + "We have covered a few open source tools for parsing data and provided a canonical RAG pipeline directly using an open source VectorDB together with an LLM. There is a growing number of frameworks that offer similar functionality wrapping the same core concepts at a higher level of abstraction. The two most popular ones are `Langchain` and `LlamaIndex`. \n", + "\n", + "For instance, the code below shows how to use `LlamaIndex`'s `LlamaParse` for parsing input documents, which offers support for a wide range of file formats (e.g. .pdf, .pptx, .docx, .xlsx, .html). We we can see that the code is very similar to the one we used for `MarkitDown` and `Docling`.\n", + "\n", + "```python\n", + "from llama_parse import LlamaParse\n", + "\n", + "# Initialize the parser\n", + "parser = LlamaParse(\n", + " api_key=\"llx-your-api-key-here\",\n", + " result_type=\"markdown\", # Can be \"markdown\" or \"text\"\n", + " verbose=True\n", + ")\n", + "\n", + "documents = parser.load_data([\"./doc1.pdf\", \"./doc2.pdf\"])\n", + "```\n", + "\n", + "\n", + "\n", + "As another example, the code below replicates our ChromaDB-based retrieval system using `LlamaIndex` {cite}`llamaindex2024storing`.\n", + "\n", + "As we can see, similar concepts are used in both frameworks:\n", + "- Documents to represent elements of the knowledge base\n", + "- Collections to store the documents\n", + "- Indexing of embeddings in the VectorDB and finally\n", + "- Querying the VectorDB to retrieve the documents\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "import chromadb\n", + "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n", + "from llama_index.vector_stores.chroma import ChromaVectorStore\n", + "from llama_index.core import StorageContext\n", + "\n", + "# load some documents\n", + "documents = SimpleDirectoryReader(\"./data\").load_data()\n", + "\n", + "# initialize client, setting path to save data\n", + "db = chromadb.PersistentClient(path=\"./chroma_db\")\n", + "\n", + "# create collection\n", + "chroma_collection = db.get_or_create_collection(\"tamingllms\")\n", + "\n", + "# assign chroma as the vector_store to the context\n", + "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n", + "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", + "\n", + "# create your index\n", + "index = VectorStoreIndex.from_documents(\n", + " documents, storage_context=storage_context\n", + ")\n", + "\n", + "# create a query engine and query\n", + "query_engine = index.as_query_engine()\n", + "response = query_engine.query(\"Who is the author of Taming LLMs?\")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Frameworks are useful for quickly prototyping RAG systems and for building applications on top of them as they provide a higher level of abstraction and integration with third-party libraries. However, the underlying concepts are the same as the ones we have covered in this chapter. More often than not, problems arise when developers either do not understand the underlying concepts or fail to understand the details of the implement behind the abstractions provided by the framework. Therefore, it is recommended to try and start your implementation using lower level tools as much as possible and only when (i) the underlying problem as well as (ii) the desired solution are well understood, then consider moving to higher level frameworks if really needed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Case Studies\n", + "\n", + "This section presents two case studies to complement topics we have covered in this chapter in the context of managing input data for LLMs.\n", + "\n", + "First, we cover content chunking, in particular Content Chunking with Contextual Linking which showcases how intelligent chunking strategies can overcome both context window and output token limitations. This case study illustrates techniques for breaking down and reassembling content while maintaining coherence, enabling the generation of high-quality long-form outputs despite model constraints.\n", + "\n", + "Second, we build a Quiz generator with citations using long context window. Not all knowledge intense applications require RAGs. In this case study, we show how to use long context window as well as some additional input management techniques such as prompt caching for efficiency and reference management to enhance response accuracy and verifiability. These approaches show how to maximize the benefits of larger context models while maintaining response quality." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Case Study I: Content Chunking with Contextual Linking\n", + "\n", + "Content chunking is commonly used to breakdown long-form content into smaller, manageable chunks. In the context of RAGs, this can be helpful not only to help the retrieval system find more contextually relevant documents but also lead to a more cost efficient LLM solution since fewer tokens are processed in the context window. Furthermore, semantic chunking can increase accuracy of RAG systems {cite}`zenml2024rag`.\n", + "\n", + "Content chunking with contextual linking is a chunking technique that seeks to split input content while keeping chunk-specific context, hence allowing the LLM to maintain coherence and context when generating responses per chunks. In that way, this technique tackles two key problems:\n", + "1. The LLM's inability to process long inputs to do context-size limits\n", + "2. The LLM's inability to maintain coherence and context when generating responses per chunks\n", + "\n", + "As a consequence, a third problem is also tackled: LLM's inability to generate long-form content due to the `max_output_tokens` limitation. Since we generate responses per chunk, as we will see later, we end up with a solution that is capable of generating long-form content while maintaining coherence.\n", + "\n", + "We exemplify this technique by following these steps:\n", + "1. **Chunking the Content**: The input content is split into smaller chunks. This allows the LLM to process each chunk individually, focusing on generating a complete and detailed response for that specific section of the input.\n", + "\n", + "2. **Maintaining Context**: Each chunk is linked with contextual information from the previous chunks. This helps in maintaining the flow and coherence of the content across multiple chunks.\n", + "\n", + "3. **Generating Linked Prompts**: For each chunk, a prompt is generated that includes the chunk's content and its context. This prompt is then used to generate the output for that chunk.\n", + "\n", + "4. **Combining the Outputs**: The outputs of all chunks are combined to form the final long-form content.\n", + "\n", + "Let's examine an example implementation of this technique.\n", + "\n", + "#### Generating long-form content\n", + "\n", + "- Goal: Generate a long-form report analyzing a company's financial statement.\n", + "- Input: A company's 10K SEC filing.\n", + "\n", + "```{figure} ../_static/structured_output/diagram1.png\n", + "---\n", + "name: content-chunking-with-contextual-linking\n", + "alt: Content Chunking with Contextual Linking\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Content Chunking with Contextual Linking Schematic Representation.\n", + "```\n", + "\n", + "The diagram in {numref}`content-chunking-with-contextual-linking` illustrates the process we will follow for handling long-form content generation with Large Language Models through \"Content Chunking with Contextual Linking.\" It shows how input content is first split into manageable chunks using a chunking function (e.g. `CharacterTextSplitter` with `tiktoken` tokenizer), then each chunk is processed sequentially while maintaining context from previous chunks. For each chunk, the system updates the context, generates a dynamic prompt with specific parameters, makes a call to the LLM chain, and stores the response. After all chunks are processed, the individual responses are combined with newlines to create the final report, effectively working around the token limit constraints of LLMs while maintaining coherence across the generated content.\n", + "\n", + "**Step 1: Chunking the Content**\n", + "\n", + "There are different methods for chunking, and each of them might be appropriate for different situations. However, we can broadly group chunking strategies in two types:\n", + "- **Fixed-size Chunking**: This is the most common and straightforward approach to chunking. We simply decide the number of tokens in our chunk and, optionally, whether there should be any overlap between them. In general, we will want to keep some overlap between chunks to make sure that the semantic context doesn’t get lost between chunks. Fixed-sized chunking may be a reasonable path in many common cases. Compared to other forms of chunking, fixed-sized chunking is computationally cheap and simple to use since it doesn’t require the use of any specialied techniques or libraries.\n", + "- **Content-aware Chunking**: These are a set of methods for taking advantage of the nature of the content we’re chunking and applying more sophisticated chunking to it. Examples include:\n", + " - **Sentence Splitting**: Many models are optimized for embedding sentence-level content. Naturally, we would use sentence chunking, and there are several approaches and tools available to do this, including naive splitting (e.g. splitting on periods), NLTK, and spaCy.\n", + " - **Recursive Chunking**: Recursive chunking divides the input text into smaller chunks in a hierarchical and iterative manner using a set of separators.\n", + " - **Semantic Chunking**: This is a class of methods that leverages embeddings to extract the semantic meaning present in your data, creating chunks that are made up of sentences that talk about the same theme or topic.\n", + "\n", + " Here, we will utilize `langchain` for a content-aware sentence-splitting strategy for chunking. Langchain offers several text splitters {cite}`langchain_text_splitters` such as JSON-, Markdown- and HTML-based or split by token. We will use the `CharacterTextSplitter` with `tiktoken` as our tokenizer to count the number of tokens per chunk which we can use to ensure that we do not surpass the input token limit of our model.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_chunks(text: str, chunk_size: int, chunk_overlap: int) -> list:\n", + " \"\"\"\n", + " Split input text into chunks of specified size with specified overlap.\n", + "\n", + " Args:\n", + " text (str): The input text to be chunked.\n", + " chunk_size (int): The maximum size of each chunk in tokens.\n", + " chunk_overlap (int): The number of tokens to overlap between chunks.\n", + "\n", + " Returns:\n", + " list: A list of text chunks.\n", + " \"\"\"\n", + " from langchain_text_splitters import CharacterTextSplitter\n", + "\n", + " text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", + " return text_splitter.split_text(text)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Step 2: Writing the Base Prompt Template**\n", + "\n", + "We will write a base prompt template which will serve as a foundational structure for all chunks, ensuring consistency in the instructions and context provided to the language model. The template includes the following parameters:\n", + "- `role`: Defines the role or persona the model should assume.\n", + "- `context`: Provides the background information or context for the task.\n", + "- `instruction`: Specifies the task or action the model needs to perform.\n", + "- `input_text`: Contains the actual text input that the model will process.\n", + "- `requirements`: Lists any specific requirements or constraints for the output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.prompts import PromptTemplate\n", + "def get_base_prompt_template() -> str:\n", + " \n", + " base_prompt = \"\"\"\n", + " ROLE: {role}\n", + " CONTEXT: {context}\n", + " INSTRUCTION: {instruction}\n", + " INPUT: {input}\n", + " REQUIREMENTS: {requirements}\n", + " \"\"\"\n", + " \n", + " prompt = PromptTemplate.from_template(base_prompt)\n", + " return prompt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will write a simple function that returns an `LLMChain` which is a simple `langchain` construct that allows you to chain together a combination of prompt templates, language models and output parsers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_community.chat_models import ChatLiteLLM\n", + "\n", + "def get_llm_chain(prompt_template: str, model_name: str, temperature: float = 0):\n", + " \"\"\"\n", + " Returns an LLMChain instance using langchain.\n", + "\n", + " Args:\n", + " prompt_template (str): The prompt template to use.\n", + " model_name (str): The name of the model to use.\n", + " temperature (float): The temperature setting for the model.\n", + "\n", + " Returns:\n", + " llm_chain: An instance of the LLMChain.\n", + " \"\"\"\n", + " \n", + " from dotenv import load_dotenv\n", + " import os\n", + "\n", + " # Load environment variables from .env file\n", + " load_dotenv()\n", + " \n", + " api_key_label = model_name.split(\"/\")[0].upper() + \"_API_KEY\"\n", + " llm = ChatLiteLLM(\n", + " model=model_name,\n", + " temperature=temperature,\n", + " api_key=os.environ[api_key_label],\n", + " )\n", + " llm_chain = prompt_template | llm | StrOutputParser()\n", + " return llm_chain" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Step 3: Constructing Dynamic Prompt Parameters**\n", + "\n", + "Now, we will write a function (`get_dynamic_prompt_template`) that constructs prompt parameters dynamically for each chunk." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Dict\n", + "def get_dynamic_prompt_params(prompt_params: Dict, \n", + " part_idx: int, \n", + " total_parts: int,\n", + " chat_context: str,\n", + " chunk: str) -> str:\n", + " \"\"\"\n", + " Construct prompt template dynamically per chunk while maintaining the chat context of the response generation.\n", + " \n", + " Args:\n", + " prompt_params (Dict): Original prompt parameters\n", + " part_idx (int): Index of current conversation part\n", + " total_parts (int): Total number of conversation parts\n", + " chat_context (str): Chat context from previous parts\n", + " chunk (str): Current chunk of text to be processed\n", + " Returns:\n", + " str: Dynamically constructed prompt template with part-specific params\n", + " \"\"\"\n", + " dynamic_prompt_params = prompt_params.copy()\n", + " # saves the chat context from previous parts\n", + " dynamic_prompt_params[\"context\"] = chat_context\n", + " # saves the current chunk of text to be processed as input\n", + " dynamic_prompt_params[\"input\"] = chunk\n", + " \n", + " # Add part-specific instructions\n", + " if part_idx == 0: # Introduction part\n", + " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", + " You are generating the Introduction part of a long report.\n", + " Don't cover any topics yet, just define the scope of the report.\n", + " \"\"\"\n", + " elif part_idx == total_parts - 1: # Conclusion part\n", + " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", + " You are generating the last part of a long report. \n", + " For this part, first discuss the below INPUT. Second, write a \"Conclusion\" section summarizing the main points discussed given in CONTEXT.\n", + " \"\"\"\n", + " else: # Main analysis part\n", + " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", + " You are generating part {part_idx+1} of {total_parts} parts of a long report.\n", + " For this part, analyze the below INPUT.\n", + " Organize your response in a way that is easy to read and understand either by creating new or merging with previously created structured sections given in CONTEXT.\n", + " \"\"\"\n", + " \n", + " return dynamic_prompt_params" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "**Step 4: Generating the Report**\n", + "\n", + "Finally, we will write a function that generates the actual report by calling the `LLMChain` with the dynamically updated prompt parameters for each chunk and concatenating the results at the end." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_report(input_content: str, llm_model_name: str, \n", + " role: str, requirements: str,\n", + " chunk_size: int, chunk_overlap: int) -> str:\n", + " # stores the parts of the report, each generated by an individual LLM call\n", + " report_parts = [] \n", + " # split the input content into chunks\n", + " chunks = get_chunks(input_content, chunk_size, chunk_overlap)\n", + " # initialize the chat context with the input content\n", + " chat_context = input_content\n", + " # number of parts to be generated\n", + " num_parts = len(chunks)\n", + "\n", + " prompt_params = {\n", + " \"role\": role, # user-provided\n", + " \"context\": \"\", # dinamically updated per part\n", + " \"instruction\": \"\", # dynamically updated per part\n", + " \"input\": \"\", # dynamically updated per part\n", + " \"requirements\": requirements #user-priovided\n", " }\n", "\n", " # get the LLMChain with the base prompt template\n", @@ -2076,14 +2981,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Case Study II: Github RAG\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Case Study III: Quiz Generation with Citations\n", + "### Case Study II: Quiz Generation with Citations\n", "\n", "In this case study, we will build a Quiz generator with citations that explores additional input management techniques particularly useful with long context windows. The implementation includes prompt caching for efficiency and citation tracking to enhance accuracy and verifiability. We will use Gemini 1.5 Pro as our LLM model, which has a context window of 2M tokens.\n", "\n", @@ -2400,7 +3298,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Conclusion" + "## Conclusion\n", + "\n", + "This chapter has explored critical strategies and techniques for managing input data in LLM applications, focusing on three key areas: data parsing, retrieval augmentation, and practical implementation patterns. We examined how parsing tools like MarkItDown and Docling can transform diverse data formats into LLM-compatible representations, demonstrating through case studies how parser quality can impact LLM performance. The chapter also investigated retrieval augmentation techniques, particularly RAG systems, showing how they can enhance LLM capabilities by providing access to external knowledge while discussing their future relevance in the context of emerging long-context language models.\n", + "\n", + "Through our case studies, we demonstrated practical approaches to handling common challenges in LLM applications. The Content Chunking with Contextual Linking case study illustrated techniques for managing long-form content generation while maintaining coherence across chunks. The Quiz Generation with Citations case study showcased how long-context windows can be effectively utilized without the need for complex retrieval systems, highlighting the importance of choosing the right approach based on specific application requirements rather than defaulting to more complex solutions.\n", + "\n", + "As the field continues to evolve, the choice between traditional RAG systems and emerging long-context models will likely become increasingly nuanced. While RAGs offer cost-effective solutions for incorporating external knowledge, the rise of long-context models suggests a future where simpler architectures might suffice for many applications. The key insight is that effective input data management requires careful consideration of trade-offs among complexity, cost, and performance, always guided by specific application requirements rather than following a one-size-fits-all approach. Success in building robust LLM applications will depend on understanding these trade-offs and selecting appropriate strategies for each use case." ] }, { diff --git a/tamingllms/_static/evals/llm_judge.svg b/tamingllms/_static/evals/llm_judge.svg deleted file mode 100644 index 4292dfa..0000000 --- a/tamingllms/_static/evals/llm_judge.svg +++ /dev/null @@ -1,879 +0,0 @@ -LLM Judge Evaluation SystemLLM-Judgecomponentsapps
    App Rankings
    -Detailed Scores
    -Analysis Report
    -
    -
    Task description
    -Scoring guidelines
    -Output format
    -
    -
    (Optional)Ground TruthLLM App 1LLM App 2

    ...

    -
    LLM App N Generate EvaluationPrompt Compare ResultsSubmit for Review - - - - - - - - - - - - - - - - -
    \ No newline at end of file diff --git a/tamingllms/_static/evals/llmjudge.d2 b/tamingllms/_static/evals/llmjudge.d2 index 4e178ae..7f0d25e 100644 --- a/tamingllms/_static/evals/llmjudge.d2 +++ b/tamingllms/_static/evals/llmjudge.d2 @@ -14,6 +14,22 @@ container: { label: "LLM-Judge" } + # Evaluation results + results: { + score: Score { + shape: rectangle + style.fill: "#FFFFFF" + style.stroke: "#2ECC71" + } + + explanation: Explanation { + shape: rectangle + style.fill: "#FFFFFF" + style.stroke: "#2ECC71" + label: "Reasoning for score" + } + } + # Evaluation components section components: { prompt: Evaluation Prompt { @@ -51,9 +67,7 @@ container: { style.stroke: "#3498DB" } - dots: |md - ... - | + dots: "..." appN: LLM App N { shape: rectangle @@ -62,22 +76,9 @@ container: { } } - # Output section - output: Evaluation Results { - shape: page - style.fill: "#EAFAF1" - style.stroke: "#2ECC71" - label: |md - ``` - App Rankings - Detailed Scores - Analysis Report - ``` - | - } - # Connections between components - base_llm -> output: Generate Evaluation + base_llm -> results.score + base_llm -> results.explanation components.prompt -> base_llm: Prompt components.reference -> base_llm: Compare Results { @@ -89,4 +90,4 @@ container: { apps.appN -> base_llm } -direction: right +direction: right \ No newline at end of file diff --git a/tamingllms/_static/evals/meta2.svg b/tamingllms/_static/evals/meta2.svg deleted file mode 100644 index 8833843..0000000 --- a/tamingllms/_static/evals/meta2.svg +++ /dev/null @@ -1,882 +0,0 @@ -LLM Judge Pairwise Evaluation SystemPool of LLM JudgesPairwiseSelectorllmcomparison_pairHumanEvaluatorsRankingAlgorithm
    LLM Judges Leaderboard
    ----------------------
    -1. Judge C (0.95)
    -2. Judge A (0.92)
    -3. Judge B (0.89)
    -   ...
    -N. Judge X (0.75)
    -
    -
    PromptLLM ResponseJudge AvsJudge B Draw JudgesGenerate PairInput forEvaluationEvaluatePreferencesGenerateRankings - - - - - - - - - - - - - - - - - - - - -
    \ No newline at end of file diff --git a/tamingllms/_static/input/LC.png b/tamingllms/_static/input/LC.png new file mode 100644 index 0000000..72602d1 Binary files /dev/null and b/tamingllms/_static/input/LC.png differ diff --git a/tamingllms/_static/input/embedding.d2 b/tamingllms/_static/input/embedding.d2 new file mode 100644 index 0000000..e9185c4 --- /dev/null +++ b/tamingllms/_static/input/embedding.d2 @@ -0,0 +1,38 @@ +container: { + shape: rectangle + style.stroke: "#D5DBDB" + style.stroke-width: 2 + style.fill: "#F7FBFF" + + input: "Who is the Author of..." { + shape: rectangle + style.fill: "#FFFFFF" + style.stroke: "#2ECC71" + style.font-color: "#2ECC71" + } + + model: { + shape: rectangle + style.fill: "#FEF9E7" + style.stroke: "#F4D03F" + + network: "all-MiniLM-L6-v2" { + style.font-size: 24 + } + } + + output: "[0.123, 0.456, 0.789, ...]" { + shape: rectangle + style.fill: "#FFFFFF" + style.stroke: "#E74C3C" + style.font-color: "#E74C3C" + } + + # Connections + input -> model -> output + + # Label below model + label: "Embedding" +} + +direction: right diff --git a/tamingllms/_static/input/embedding.svg b/tamingllms/_static/input/embedding.svg new file mode 100644 index 0000000..adbe91b --- /dev/null +++ b/tamingllms/_static/input/embedding.svg @@ -0,0 +1,118 @@ + + + + + + + + +EmbeddingWho is the Author of...model[0.123, 0.456, 0.789, ...]all-MiniLM-L6-v2 + + + + + + + \ No newline at end of file diff --git a/tamingllms/_static/input/incontext.svg b/tamingllms/_static/input/incontext.svg new file mode 100644 index 0000000..82c636f --- /dev/null +++ b/tamingllms/_static/input/incontext.svg @@ -0,0 +1,4 @@ + + + +
    Retrieval
    Retrieval
    RAG Context
    RAG Context
    reranking
    reranking
    Query
    Query

    LLM

    LLM

    Context Window

    Context Wi...
    Retrieval System
    Retrieval System
    VectorDB
    VectorDB
    \ No newline at end of file diff --git a/tamingllms/_static/input/incontext.xml b/tamingllms/_static/input/incontext.xml new file mode 100644 index 0000000..1a15d1d --- /dev/null +++ b/tamingllms/_static/input/incontext.xml @@ -0,0 +1,57 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tamingllms/_static/input/rag.svg b/tamingllms/_static/input/rag.svg new file mode 100644 index 0000000..6b77e28 --- /dev/null +++ b/tamingllms/_static/input/rag.svg @@ -0,0 +1,4 @@ + + + +
    Data Parsing & Ingestion
    Data
    Embeddings
    Retrieval
    RAG Context
    reranking
    Query

    LLM

    Context Window

    Indexing
    Query
    User
    VectorDB
    Retrieval System
    RAG
    \ No newline at end of file diff --git a/tamingllms/_static/input/rag.xml b/tamingllms/_static/input/rag.xml new file mode 100644 index 0000000..7c6a681 --- /dev/null +++ b/tamingllms/_static/input/rag.xml @@ -0,0 +1,122 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tamingllms/_static/input/similarity.png b/tamingllms/_static/input/similarity.png new file mode 100644 index 0000000..4f2f228 Binary files /dev/null and b/tamingllms/_static/input/similarity.png differ diff --git a/tamingllms/_static/input/vectordb.png b/tamingllms/_static/input/vectordb.png new file mode 100644 index 0000000..da956ce Binary files /dev/null and b/tamingllms/_static/input/vectordb.png differ diff --git a/tamingllms/latex/evals.tex b/tamingllms/latex/evals.tex new file mode 100644 index 0000000..792c2af --- /dev/null +++ b/tamingllms/latex/evals.tex @@ -0,0 +1,1910 @@ +\setchapterpreamble[u]{\margintoc} + +\chapter{The Evals Gap} +\labch{evals} + +\epigraph{It doesn't matter how beautiful your theory is, \\ +it doesn't matter how smart you are. \\ +If it doesn't agree with experiment, it's wrong.}{Richard Feynman} + +\section{Introduction} + +The advent of LLMs marks a pivotal shift in the landscape of software development, testing and verification. Unlike traditional software systems, where deterministic outputs are the norm, LLMs introduce a realm of non-deterministic and generative behaviors that challenge conventional software engineering paradigms. This shift is not merely a technical evolution but a fundamental transformation in how we conceive, build, and assess software products. + +For those entrenched in traditional methodologies, the transition to LLM-driven systems may seem daunting. However, ignoring this change is not an option. The reliance on outdated testing frameworks that fail to account for the probabilistic nature of LLMs will inevitably lead to significant setbacks. + +To overcome these challenges, it is imperative to embrace the complexities of LLMs with a proactive mindset. This involves developing robust evaluation frameworks up-front that incorporate the generative nature of LLM-based software development while fostering a culture of continuous change, learning and adaptation. + +\section{Non-Deterministic Generative Machines} + +One of the most fundamental challenges when building products with LLMs is their generative and non-deterministic nature. Unlike traditional software systems where the same input reliably produces the same output, LLMs can generate novel text that may not exist in their training data, and produce different responses each time they're queried - even with identical prompts and input data. This behavior is both a strength and a significant engineering and product challenge. + +When you ask an LLM the same question multiple times, you'll likely get different responses. This isn't a bug - it's a fundamental feature of how these models work. The ``temperature'' parameter, which controls the randomness of outputs, allows models to be creative and generate diverse responses. However, this same feature makes it difficult to build reliable, testable systems. + +Consider a financial services company using LLMs to generate investment advice. The non-deterministic nature of these models means that: +\begin{itemize} + \item The same input data could yield different analysis conclusions + \item Regulatory compliance becomes challenging to guarantee + \item User trust may be affected by inconsistent responses + \item Testing becomes exceedingly more complex compared to traditional software +\end{itemize} + +The primary source of non-determinism in LLMs comes from their sampling strategies. During text generation, the model: +\begin{enumerate} + \item Calculates probability distributions for each next token + \item Samples from these distributions based on temperature settings + \item Uses techniques like nucleus sampling \sidecite{holtzman2020curiouscaseneuraltext} or top-k sampling to balance creativity and coherence +\end{enumerate} + +In this simple experiment, we use an LLM to write a single-statement executive summary from an input financial filing. We observe that even a simple parameter like temperature can dramatically alter model behavior in ways that are difficult to systematically assess. At temperature $0.0$, responses are consistent but potentially too rigid. At $1.0$, outputs become more varied but less predictable. At $2.0$, responses can be wildly different and often incoherent. This non-deterministic behavior makes traditional software testing approaches inadequate. + +\begin{minted}{python} +from dotenv import load_dotenv +import os + +# Load environment variables from .env file +load_dotenv() + +from openai import OpenAI +import pandas as pd +from typing import List + +def generate_responses( + model_name: str, + prompt: str, + temperatures: List[float], + attempts: int = 3 +) -> pd.DataFrame: + """ + Generate multiple responses at different temperature settings + to demonstrate non-deterministic behavior. + """ + client = OpenAI() + results = [] + + for temp in temperatures: + for attempt in range(attempts): + response = client.chat.completions.create( + model=model_name, + messages=[{"role": "user", "content": prompt}], + temperature=temp, + max_tokens=50 + ) + + results.append({ + 'temperature': temp, + 'attempt': attempt + 1, + 'response': response.choices[0].message.content + }) + + # Display results grouped by temperature + df_results = pd.DataFrame(results) + for temp in temperatures: + print(f"\nTemperature = {temp}") + print("-" * 40) + temp_responses = df_results[df_results['temperature'] == temp] + for _, row in temp_responses.iterrows(): + print(f"Attempt {row['attempt']}: {row['response']}") + + return df_results +\end{minted} +% End of Selection + + +\begin{minted}{python} +MAX_LENGTH = 10000 # We limit the input length to avoid token issues +with open('../data/apple.txt', 'r') as file: + sec_filing = file.read() + +sec_filing +\end{minted} + +\begin{figure}[h] +\centering +\includegraphics[width=0.7\textwidth]{evals/apple_sec.png} +\caption{Part of Apple Inc's SEC Filing. November 1, 2024 - 10-K: Annual report for year ending September 28, 2024} +\label{fig:apple-sec-temps} +\end{figure} + + +\begin{minted}{python} +MAX_LENGTH = 10000 # We limit the input length to avoid token issues +with open('../data/apple.txt', 'r') as file: + sec_filing = file.read() +sec_filing = sec_filing[:MAX_LENGTH] +df_results = generate_responses(model_name="gpt-3.5-turbo", + prompt=f"Write a single-statement executive summary of the following text: {sec_filing}", + temperatures=[0.0, 1.0, 2.0]) +\end{minted} + + +\begin{verbatim} +Temperature = 0.0 +---------------------------------------- +Attempt 1: Apple Inc. filed its Form 10-K for the fiscal year ended September 28, 2024 with the SEC, detailing its business operations and financial performance. +Attempt 2: Apple Inc. filed its Form 10-K with the SEC for the fiscal year ended September 28, 2024, detailing its business operations, products, and financial information. +Attempt 3: Apple Inc. filed its Form 10-K with the SEC for the fiscal year ended September 28, 2024, detailing its business operations, products, and financial information. + +Temperature = 1.0 +---------------------------------------- +Attempt 1: Apple Inc., a well-known seasoned issuer based in California, designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories, with a focus on innovation and technology. +Attempt 2: Apple Inc. filed its Form 10-K with the SEC for the fiscal year ended September 28, 2024, reporting on its business operations, products, and financial performance. +Attempt 3: Apple Inc., a well-known seasoned issuer, filed its Form 10-K for the fiscal year ended September 28, 2024, reporting on its financial condition and operations. + +Temperature = 2.0 +---------------------------------------- +Attempt 1: The Form 10-K for Apple Inc. for the fiscal year ended September 28, 2024, filed with the Securities and Exchange Commission, outlines the company's financial performance, products, and risk factors affecting future results. +Attempt 2: Apple Inc., a California-based company and leading technology manufacturer invDestacksmeticsisdiction setIspection-$20cyan evaluationseld anvisions droitEntering discernminerval Versbobprefversible vo该 Option和 meio forecast времCisco dellaischenpoihsCapabilities Geme.getTime future +Attempt 3: Apple Inc's Form 10-K provides a comprehensive overview of the company's financial reporting, business operations, products and market information. +\end{verbatim} + +A temperature of 1 represents the unscaled probability scores for each token in the vocabulary. Decreasing the temperature closer to 0 sharpens the distribution, so the most likely token will have an even higher probability score. Conversely, increasing the temperature makes the distribution more uniform \sidecite{build-llms-from-scratch-book}: +\begin{itemize} + \item Temperature = 0: Most deterministic, but potentially repetitive + \item Temperature = 1: Balanced creativity and coherence + \item Temperature > 1: Increased randomness, potentially incoherent +\end{itemize} + +How can one effectively test an LLM-powered system when the same prompt can yield radically different outputs based on a single parameter? Traditional testing relies on predictable inputs and outputs, but LLMs force us to grapple with probabilistic behavior. While lower temperatures may seem safer for critical applications, they don't necessarily eliminate the underlying uncertainty. This highlights the need for new evaluation paradigms that can handle both deterministic and probabilistic aspects of LLM behavior. + +\section{Emerging Properties} + +Beyond their non-deterministic nature, LLMs present another fascinating characteristic: emergent abilities that spontaneously arise as models scale up in size. These abilities - from basic question answering to complex reasoning - aren't explicitly programmed but rather emerge ``naturally'' as the models grow larger and are trained on more data. This makes evaluation fundamentally different from traditional software testing, where capabilities are explicitly coded and can be tested against pre-defined specifications. + +Figure~\ref{fig:emerging-properties} provides a list of emergent abilities of large language models and the scale \sidecite{wei2022emergentabilitieslargelanguage}. The relationship between model scale and emergent abilities follows a fascinating non-linear pattern. Below certain size thresholds, specific abilities may be completely absent from the model - it simply cannot perform certain tasks, no matter how much you try to coax them out. However, once the model reaches critical points in its scaling journey, these abilities can suddenly manifest in what researchers call a phase transition - a dramatic shift from inability to capability. This unpredictable emergence of capabilities stands in stark contrast to traditional software development, where features are deliberately implemented and can be systematically tested. + +\begin{figure}[h] +\centering +\includegraphics[width=0.6\textwidth]{evals/emerging.png} +\caption{Emergent abilities of large language models and the scale \cite{wei2022emergentabilitieslargelanguage}} +\label{fig:emerging-properties} +\end{figure} + +The implications for evaluation are critical. While conventional software testing relies on stable test suites and well-defined acceptance criteria, LLM evaluation must contend with a constantly shifting landscape of capabilities. What worked to evaluate a 7B parameter model may be completely inadequate for a 70B parameter model that has developed new emergent abilities. This dynamic nature of LLM capabilities forces us to fundamentally rethink our approach to testing and evaluation. + +\section{Problem Statement} + +Consider a practical example that illustrates these challenges: building a Math AI tutoring system for children powered by an LLM. In traditional software development, you would define specific features (like presenting math problems or checking answers) and write tests to verify each function. But with LLMs, you're not just testing predefined features - you're trying to evaluate emergent capabilities like adapting explanations to a child's level, maintaining engagement through conversational learning, and providing age-appropriate safety-bound content. + +This fundamental difference raises critical questions about evaluation: +\begin{itemize} + \item How do we measure capabilities that weren't explicitly programmed? + \item How can we ensure consistent performance when abilities may suddenly emerge or evolve? + \item What metrics can capture both the technical accuracy and the subjective quality of responses? +\end{itemize} + +The challenge becomes even more complex when we consider that traditional software evaluation methods simply weren't designed for these kinds of systems. There is an \textbf{Evals Gap} between traditional software testing and LLM evaluation. We need new frameworks that can account for both the deterministic aspects we're used to testing and the emergent properties that make LLMs unique. + +Table~\ref{evals-table} summarizes how LLM evaluation differs from traditional software testing across several key dimensions: +\begin{itemize} + \item \textbf{Capability Assessment vs Functional Testing}: Traditional software testing validates specific functionality against predefined requirements. LLM evaluation must assess not necessarily pre-defined behavior but also ``emergent properties'' like reasoning, creativity, and language understanding that extend beyond explicit programming. + + \item \textbf{Metrics and Measurement Challenges}: While traditional software metrics can usually be precisely defined and measured, LLM evaluation often involves subjective qualities like ``helpfulness'' or ``naturalness'' that resist straightforward quantification. Even when we try to break these down into numeric scores, the underlying judgment often remains inherently human and context-dependent. + + \item \textbf{Dataset Contamination}: Traditional software testing uses carefully crafted test cases with known inputs and expected outputs (e.g., unit tests). In contrast, LLMs trained on massive internet-scale datasets risk having already seen and memorized evaluation examples during training, which can lead to artificially inflated performance scores. This requires careful dataset curation to ensure test sets are truly unseen by the model and rigorous cross-validation approaches. + + \item \textbf{Benchmark Evolution}: Traditional software maintains stable test suites over time. LLM benchmarks continuously evolve as capabilities advance, making longitudinal performance comparisons difficult and potentially obsoleting older evaluation methods. + + \item \textbf{Human Evaluation Requirements}: Traditional software testing automates most validation. LLM evaluation may demand significant human oversight to assess output quality, appropriateness, and potential biases through structured annotation and systematic review processes. +\end{itemize} + +\begin{table}[h] +\caption{Evals of Traditional Software vs LLMs} +\label{evals-table} +\begin{tabular}{p{0.2\textwidth}p{0.35\textwidth}p{0.35\textwidth}} +\hline +\textbf{Aspect} & \textbf{Traditional Software} & \textbf{LLMs} \\ +\hline +Capability Assessment & Validates specific functionality against requirements & May assess emergent properties like reasoning and creativity \\ +\hline +Metrics and Measurement & Precisely defined and measurable metrics & Subjective qualities that resist straightforward quantification \\ +\hline +Dataset Contamination & Uses carefully crafted test cases & Risk of memorized evaluation examples from training \\ +\hline +Benchmark Evolution & Maintains stable test suites & Continuously evolving benchmarks as capabilities advance \\ +\hline +Human Evaluation & Mostly automated validation & May require significant human oversight \\ +\hline +\end{tabular} +\end{table} + +\section{Evals Design} + +A critical distinction must be made between evaluating an LLM versus evaluating an LLM-based application. While LLMs offer foundation capabilities and are typically general-purpose, LLM-based applications are more specific and tailored to particular use cases. An LLM-based application can be defined as a system that uses one or more LLMs to perform a specific task. More precisely, it represents the combination of one or more LLM models, their associated prompts and parameters configured to solve a particular business problem. + +This differentiation significantly impacts the scope of evaluation. LLMs are typically evaluated based on their fundamental capabilities, including language understanding, reasoning and knowledge. In contrast, LLM-based applications should be evaluated based on their end-to-end functionality, performance, and how effectively they meet business requirements. This distinction carries several key implications for designing evaluation systems: + +\begin{itemize} + \item The same LLM can yield different results in different applications + \item Evaluation must align with business objectives + \item A great LLM does not guarantee a great application +\end{itemize} + +Examples of key requirements for validation are listed in Table~\ref{validation-requirements}, ranging from Safety, Cognitive, Technical, Meta-Cognitive, to Ethical aspects. +The validation requirements for LLM applications span multiple critical categories, each with specific testing needs and importance: + +\textbf{Safety Requirements} +\begin{itemize} + \item \textbf{Misinformation Prevention} + \begin{itemize} + \item Testing needs: Factual accuracy verification, response consistency, hallucination detection, citation accuracy, uncertainty handling, temporal consistency, scientific accuracy + \item Importance: Prevents real-world harm, maintains user trust, reduces legal risks, ensures reliable decision support + \end{itemize} + + \item \textbf{Unqualified Advice Prevention} + \begin{itemize} + \item Testing needs: Recognition of professional queries, disclaimer consistency, referral mechanisms, boundary recognition, emergency handling + \item Importance: Prevents harm from incorrect advice, reduces liability, protects vulnerable users + \end{itemize} + + \item \textbf{Bias Detection} + \begin{itemize} + \item Testing needs: Gender/racial/cultural bias assessment, demographic representation, language inclusivity, stereotype avoidance + \item Importance: Prevents bias reinforcement, ensures service equality, maintains social responsibility + \end{itemize} + + \item \textbf{Privacy Protection} + \begin{itemize} + \item Testing needs: PII handling, data anonymization, information leakage prevention, compliance verification + \item Importance: Protects confidentiality, ensures regulatory compliance, prevents breaches + \end{itemize} +\end{itemize} + +\textbf{Cognitive Requirements} +\begin{itemize} + \item \textbf{Reasoning \& Logic} + \begin{itemize} + \item Testing needs: Problem-solving capability, mathematical accuracy, logical fallacy detection, causal reasoning + \item Importance: Ensures reliable problem-solving, maintains computational accuracy + \end{itemize} + + \item \textbf{Language Understanding} + \begin{itemize} + \item Testing needs: Context maintenance, idiom comprehension, cultural references, technical terminology + \item Importance: Ensures effective communication, prevents misunderstandings + \end{itemize} +\end{itemize} + +\textbf{Technical Requirements} +\begin{itemize} + \item \textbf{Code Generation} + \begin{itemize} + \item Testing needs: Syntax accuracy, security scanning, performance optimization, documentation quality + \item Importance: Ensures code reliability, prevents security issues + \end{itemize} + + \item \textbf{System Integration} + \begin{itemize} + \item Testing needs: API handling, rate limits, error management, response time, scalability + \item Importance: Ensures system reliability, maintains performance + \end{itemize} +\end{itemize} + +\textbf{Meta-Cognitive Requirements} +\begin{itemize} + \item \textbf{Self-Awareness} + \begin{itemize} + \item Testing needs: Knowledge limitation recognition, uncertainty communication, correction capabilities + \item Importance: Builds trust, prevents overconfidence + \end{itemize} + + \item \textbf{Communication Quality} + \begin{itemize} + \item Testing needs: Message clarity, audience appropriateness, information density + \item Importance: Ensures understanding, maintains engagement + \end{itemize} +\end{itemize} + +\textbf{Ethical Requirements} +\begin{itemize} + \item \textbf{Harmful Content Prevention} + \begin{itemize} + \item Testing needs: Content recognition, response appropriateness, filtering mechanisms + \item Importance: Protects user safety, prevents misuse + \end{itemize} + + \item \textbf{Decision-Making} + \begin{itemize} + \item Testing needs: Moral consistency, value alignment, fairness assessment + \item Importance: Ensures ethical deployment, maintains standards + \end{itemize} +\end{itemize} + +\textbf{Environmental Requirements} +\begin{itemize} + \item \textbf{CO2 Emission} + \begin{itemize} + \item Testing needs: Energy consumption monitoring, model efficiency, server optimization + \item Importance: Reduces environmental impact, supports sustainability + \end{itemize} +\end{itemize} + + + +\section{Conceptual Overview} + +Figure~\ref{fig:conceptual} demonstrates a conceptual design of key components of LLM Application evaluation. + +\begin{figure}[h] +\centering +\includegraphics[width=0.4\textwidth]{evals/conceptual.png} +\caption{Conceptual overview of LLM-based application evaluation.} +\label{fig:conceptual} +\end{figure} +We observe three key components: + +\textbf{1. Examples (Input Dataset)} +\begin{itemize} + \item \textbf{Input:} Query to LLM App, e.g. user message, input file, image, audio, etc. + \item \textbf{Output:} A reference expected outcome from the LLM application. Provide ground truth for comparison (\textit{Optional}). + \item \textbf{Purpose:} Provides standardized test cases for evaluation. +\end{itemize} + +\textbf{2. LLM Application (Processing Layer)} +\begin{itemize} + \item \textbf{Input:} Test cases input from Examples + \item \textbf{Output:} Generated responses/results + \item \textbf{Purpose:} + \begin{itemize} + \item Represents different LLM implementations/vendors solving a specific task + \item Could be different models (GPT-4, Claude, PaLM, etc.) + \item Could be different configurations of same model + \item Could be different prompting strategies + \end{itemize} +\end{itemize} + +\textbf{3. Evaluator (Assessment Layer)} +\begin{itemize} + \item \textbf{Input:} + \begin{itemize} + \item Outputs from LLM application + \item Reference data from Examples (\textit{Optional}) + \end{itemize} + \item \textbf{Output:} Individual scores for target LLM application + \item \textbf{Purpose:} + \begin{itemize} + \item Measures LLM Application performance across defined metrics + \item Applies standardized scoring criteria + \end{itemize} +\end{itemize} +Note that \textbf{Examples} must provide input data to the LLM Application for further evaluation. However, ground truth data is optional. We will return to this in more detail below, where we will see that ground truth data is not always available or practical. Additionally, there are approaches where one can evaluate LLM Applications without ground truth data. + +A more general conceptual design is shown in Figure~\ref{fig:conceptual-multi}, where multiple LLM Applications are evaluated. This design allows for a more comprehensive evaluation of different configurations of LLM-based applications, e.g.: +\begin{itemize} + \item Fixing all application parameters and evaluating different LLM models with their default configurations + \item Fixing all parameters of an LLM model and evaluating different prompting strategies +\end{itemize} + +\begin{figure}[h] +\centering +\includesvg[width=0.5\textwidth]{evals/conceptual-multi.svg} +\caption{Conceptual overview of Multiple LLM-based applications evaluation.} +\label{fig:conceptual-multi} +\end{figure} + +In this evaluation framework, the same inputs are provided to all LLM applications, ensuring that responses are evaluated consistently. Performance is quantified objectively for each LLM Application, and results are ranked for easy comparison. This design leads to two additional components: +\textbf{4. Scores (Metrics Layer)} +\begin{itemize} + \item \textbf{Input:} Evaluation results from Evaluator + \item \textbf{Output:} Quantified performance metrics + \item \textbf{Purpose:} + \begin{itemize} + \item Represents performance in numerical form + \item Enables quantitative comparison among LLM applications + \item May include multiple metrics per LLM application + \end{itemize} +\end{itemize} + +\textbf{5. Leaderboard (Ranking Layer)} +\begin{itemize} + \item \textbf{Input:} Scores per LLM application + \item \textbf{Output:} Ordered ranking of LLMs with scores + \item \textbf{Purpose:} + \begin{itemize} + \item Aggregates and ranks performances across LLM applications + \end{itemize} +\end{itemize} + +\section{Design Considerations} +\subsection{Design Considerations} + +The design of an LLM application evaluation system depends heavily on the specific use case and business requirements. Here we list important questions for planning an LLM application evaluation system pertaining to each of the key components previously introduced: + +\textbf{1. Examples (Input Dataset):} +\begin{itemize} + \item What types of examples should be included in the test set? + \begin{itemize} + \item Does it cover all important use cases? + \item Are edge cases represented? + \item Is there a good balance of simple and complex examples? + \end{itemize} + \item How do we ensure data quality? + \begin{itemize} + \item Are the examples representative of real-world scenarios? + \item Is there any bias in the test set? + \end{itemize} + \item Should we have separate test sets for different business requirements? + \item Do we need human-validated ground truth for all examples? + \item Can we use synthetic data to augment the test set? + \item How can business updates and user data be reflected in the dataset post-launch? +\end{itemize} + +\textbf{2. LLM Applications:} +\begin{itemize} + \item What aspects of each LLM app should be standardized for fair comparison? + \begin{itemize} + \item Prompt templates + \item Context length + \item Temperature and other parameters + \item Rate limiting and timeout handling + \end{itemize} + \item What specific configurations impact business requirements? + \begin{itemize} + \item Which LLM application variations should be tested to maximize what we learn? + \item Which LLM capabilities provide the most value for the business and how can we measure that? + \end{itemize} +\end{itemize} + +\textbf{3. Evaluator Design:} +\begin{itemize} + \item How do we define success for different types of tasks? + \begin{itemize} + \item Task-specific evaluation criteria + \item Objective metrics vs subjective assessment + \end{itemize} + \item Should evaluation be automated or involve human review? + \begin{itemize} + \item Balance between automation and human judgment + \item Inter-rater reliability for human evaluation + \item Cost and scalability considerations + \end{itemize} +\end{itemize} + +\textbf{4. Scoring System:} +\begin{itemize} + \item How should different metrics be weighted? + \begin{itemize} + \item Relative importance of different factors + \item Task-specific prioritization + \item Business requirements alignment + \end{itemize} + \item Should scores be normalized or absolute? + \item How to handle failed responses? + \item Should we consider confidence scores from the LLMs? +\end{itemize} + +\textbf{5. Leaderboard/Ranking:} +\begin{itemize} + \item How often should rankings be updated? + \item Should ranking include confidence intervals? + \item How to handle ties or very close scores? + \item Should we maintain separate rankings for different: + \begin{itemize} + \item Business requirements + \item Model Cost Tiers + \item LLM Model Families + \end{itemize} +\end{itemize} + +Holistically, your evaluation design should be built with scalability in mind to handle growing evaluation needs as the combination of (Input Examples X LLM Applications X Evaluators X Scores X Leaderboards) may grow very fast, particularly for an organization that promotes rapid experimentation and iterative development (good properties!). Finally, one should keep in mind that the evaluation system itself requires validation to confirm its accuracy and reliability vis-a-vis business requirements (evaluating evaluators will be later discussed in this Chapter). +\section{Metrics} + +The choice of metric depends on the specific task and desired evaluation criteria. However, one can categorize metrics into two broad categories: \textbf{intrinsic} and \textbf{extrinsic}. + +\begin{itemize} + \item \textbf{Intrinsic metrics} focus on the model's performance on its primary training objective, which is typically to predict the next token in a sequence. Perplexity is a common intrinsic metric that measures how well the model predicts a given sample of text. + + \item \textbf{Extrinsic metrics} assess the model's performance on various downstream tasks, which can range from question answering to code generation. These metrics are not directly tied to the training objective, but they provide valuable insights into the model's ability to generalize to real-world applications. +\end{itemize} + +Here, we are particularly interested in extrinsic metrics, since we are evaluating LLM-based applications rather than base LLM models. + +Another way to think about metrics is in terms of the type of the task we evaluate: + +\begin{enumerate} + \item \textbf{Discriminative Task}: + \begin{itemize} + \item Involves distinguishing or classifying between existing data points. + \item Examples: Sentiment analysis, classification, or identifying whether a statement is true or false. + \end{itemize} + + \item \textbf{Generative Task}: + \begin{itemize} + \item Involves creating or producing new data or outputs. + \item Examples: Text generation, image synthesis, or summarization. + \end{itemize} +\end{enumerate} +For discriminative tasks, LLM-based applications may produce log-probabilities or discrete predictions, traditional machine learning metrics like accuracy, precision, recall, and F1 score can be applied. However, generative tasks may output text or images which require different evaluation approaches. + +For generative tasks, a range of specialized metrics should be considered. These include match-based metrics such as exact match and prefix match, as well as metrics designed specifically for tasks like summarization and translation, including ROUGE, BLEU, and character n-gram comparisons. The selection of appropriate metrics should align with the specific requirements and characteristics of the task being evaluated. + +In Table~\ref{tab:key-metrics} we provide a short list of widely used extrinsic metrics that can be used to evaluate generative tasks of LLM-based applications, along with their definitions, use cases, and limitations. +\begin{table}[htbp] +\caption{Key Metrics for Evaluating Generative Tasks} +\label{tab:key-metrics} +\begin{tabular}{p{0.2\textwidth}p{0.25\textwidth}p{0.25\textwidth}p{0.25\textwidth}} +\hline +\textbf{Metric} & \textbf{Definition} & \textbf{Use Case} & \textbf{Limitations} \\ +\hline +\textbf{BLEU} (Bilingual Evaluation Understudy) & Measures overlap of n-grams between generated text and reference text & Machine translation and text summarization & \begin{itemize}\item Favors short outputs due to brevity penalty\item Insensitive to semantic meaning\item Requires high-quality reference texts\end{itemize} \\ +\hline +\textbf{ROUGE} (Recall-Oriented Understudy for Gisting Evaluation) & Measures overlap between n-grams, words, or sentences of generated text and references, focusing on recall & Text summarization tasks & \begin{itemize}\item Biases toward long outputs\item Ignores semantic equivalence\item Heavily influenced by reference quality\end{itemize} \\ +\hline +\textbf{METEOR} (Metric for Evaluation of Translation with Explicit ORdering) & Considers synonyms, stemming, and paraphrases alongside n-gram overlap & Machine translation, where semantic equivalence matters & \begin{itemize}\item Computationally expensive\item Subjective design of synonym/stemming databases\end{itemize} \\ +\hline +\textbf{CIDEr} (Consensus-based Image Description Evaluation) & Measures n-gram overlap weighted by TF-IDF, tailored for image captioning & Image caption generation & \begin{itemize}\item Limited applicability outside captioning\item Heavily reliant on corpus statistics\end{itemize} \\ +\hline +\textbf{TER} (Translation Edit Rate) & Computes number of edits needed to convert hypothesis into reference text & Translation quality evaluation & \begin{itemize}\item Doesn't consider semantic correctness\item Penalizes valid paraphrasing\end{itemize} \\ +\hline +\textbf{BERTScore} & Uses contextual embeddings from pre-trained BERT to calculate token similarity & Tasks requiring semantic equivalence & \begin{itemize}\item High computational cost\item Performance varies with model used\end{itemize} \\ +\hline +\textbf{SPICE} (Semantic Propositional Image Caption Evaluation) & Focuses on scene graphs in image captions to evaluate semantic content & Image captioning with emphasis on semantic accuracy & \begin{itemize}\item Designed only for image captions\item Less effective in purely textual tasks\end{itemize} \\ +\hline +\end{tabular} +\end{table} +A common use case for metrics like BLEU and ROUGE is to evaluate the quality of generated summaries against reference summaries. To demonstrate this, consider the task of evaluating Financial Filings summaries against reference summaries (such as analyst-prepared highlights). + +The simple metrics-based evaluator consists of several key components: +\begin{itemize} + \item \textbf{Input:} Generated summary and reference summary + \item \textbf{Output:} Dictionary containing scores for BLEU, ROUGE\_1, and ROUGE\_2 + \item \textbf{Purpose:} Evaluation of an LLM-based Financial Filings summary generator +\end{itemize} + +A \textit{Reference Summary} represents the ``ideal'' summary, which may be prepared by human experts like analysts or generated by machines. + +For this evaluation, the focus lies on comparing summaries generated by different LLM models of varying sizes and costs against a benchmark model. The evaluation setup uses: + +\begin{itemize} + \item \textbf{Benchmark model:} \texttt{gpt-4o} + \item \textbf{Test models:} \texttt{gpt-4o-mini}, \texttt{gpt-4-turbo}, \texttt{gpt-3.5-turbo} +\end{itemize} + +The core of the evaluation system is the \texttt{evaluate\_summaries} function, which calculates BLEU and ROUGE scores to assess text generation quality. This function accepts a generated summary and reference summary as input, processes them, and returns a dictionary containing three key metrics: +\begin{itemize} + \item BLEU (measuring n-gram overlap) + \item ROUGE\_1 (unigram comparison) + \item ROUGE\_2 (bigram comparison) +\end{itemize} + +This enables quantitative comparison of generated summaries against reference texts. The implementation utilizes HuggingFace's \texttt{evaluate} library to load and compute these metrics. +First, install the required Python packages: + +\begin{minted}{bash} +pip install evaluate absl-py rouge_score +\end{minted} + +The core evaluation function is implemented as follows: + +\begin{minted}{python} +import evaluate +def evaluate_summaries(generated_summary, reference_summary): + """ + Evaluate generated summaries against reference summaries using multiple metrics. + + Args: + generated_summary (str): The summary generated by the model + reference_summary (str): The reference/ground truth summary + + Returns: + dict: Dictionary containing scores for different metrics + """ + # Initialize metrics + bleu = evaluate.load("google_bleu") + rouge = evaluate.load("rouge") + + # Format inputs for BLEU (expects list of str for predictions and list of list of str for references) + predictions = [generated_summary] + references = [reference_summary] + + # Compute BLEU score + bleu_score = bleu.compute(predictions=predictions, references=[references]) + + # Compute ROUGE scores + rouge_score = rouge.compute(predictions=predictions, references=references) + + # Compute Character metric + # Combine all scores into a single dictionary + scores = { + 'bleu': bleu_score["google_bleu"], + 'rouge1': rouge_score['rouge1'], + 'rouge2': rouge_score['rouge2'] + } + + return scores +\end{minted} + +For instance, \texttt{evaluate\_summaries} can be used to compare two arbitrary sentences and returns a dictionary with our chosen metrics: + +\begin{minted}{python} +sentence1 = "the cat sat on the mat" +sentence2 = "the cat ate the mat" +evaluate_summaries(sentence1, sentence2) +\end{minted} + + +\begin{verbatim} + {'bleu': 0.3333333333333333, + 'rouge1': 0.7272727272727272, + 'rouge2': 0.4444444444444445} +\end{verbatim} + + +Next, we define \texttt{generate\_summary}, our simple LLM-based SEC filing summarizer application using OpenAI's API. It takes an arbitrary \texttt{model}, and an \texttt{input} text and returns the corresponding LLM's response with a summary. + +\begin{minted}{python} +from openai import OpenAI +client = OpenAI() + +def generate_summary(model, input): + """ + Generate a summary of input using a given model + """ + TASK = "Generate a 1-liner summary of the following excerpt from an SEC filing." + + prompt = f""" + ROLE: You are an expert analyst tasked with summarizing SEC filings. + TASK: {TASK} + """ + + response = client.chat.completions.create( + model=model, + messages=[{"role": "system", "content": prompt}, + {"role": "user", "content": input}] + ) + return response.choices[0].message.content +\end{minted} +Next, we define \texttt{evaluate\_summary\_models} - our benchmark evaluator - that compares text summaries generated by different language models against a benchmark model. The function: + +\begin{enumerate} + \item Takes a benchmark model, list of test models, prompt, and input text + \item Generates a reference summary using the benchmark model and our \texttt{generate\_summary} function + \item Generates summaries from all test models using \texttt{generate\_summary} function + \item Evaluates each test model's summary against the benchmark using \texttt{evaluate\_summaries} + \item Returns evaluation results and the generated summaries +\end{enumerate} + +\begin{minted}{python} +def evaluate_summary_models(model_benchmark, models_test, input): + """ + Evaluate summaries generated by multiple models + """ + benchmark_summary = generate_summary(model_benchmark, input) + + # Generate summaries for all test models using list comprehension + model_summaries = [generate_summary(model, input) for model in models_test] + + # Evaluate each model's summary against the benchmark + evaluation_results = [evaluate_summaries(summary, benchmark_summary) for summary in model_summaries] + + return [evaluation_results, model_summaries, benchmark_summary] +\end{minted} + +We are ready to run our benchmark evaluation. We define a benchmark model and a list of test models and then evaluate each test model's summary against the benchmark. We also print the generated summaries for each model. + +\begin{minted}{python} +model_benchmark = "gpt-4o" +models_test = ["gpt-4o-mini", "gpt-4-turbo", "gpt-3.5-turbo"] + +evals, model_summaries, benchmark_summary = evaluate_summary_models(model_benchmark, models_test, sec_filing) + +print(benchmark_summary) +\end{minted} + + + + \begin{verbatim} +Apple Inc.'s 10-K filing for the fiscal year ending September 28, 2024, outlines its operational and financial condition, detailing the company's diverse product lines, market activities, and compliance with SEC requirements. + \end{verbatim} + + + +\begin{minted}{python} +# Print each model name and its summary +for model, summary in zip(models_test, model_summaries): + print(f"{model}: \n {summary} \n---------------") +\end{minted} + +\begin{verbatim} +gpt-4o-mini: + Apple Inc. filed its Annual Report on Form 10-K for the fiscal year ending September 28, 2024, detailing its business operations, risks, and financial condition. +--------------- +gpt-4-turbo: + Apple Inc.'s Form 10-K for the fiscal year ended September 28, 2024, details its annual report as a well-known seasoned issuer, confirming compliance with SEC regulations and reporting on stock performances, securities, and corporate governance, while also including forward-looking statements subject to various risks. +--------------- +gpt-3.5-turbo: + Apple Inc. filed its Form 10-K with the SEC, revealing financial information for the fiscal year ended September 28, 2024, including details on its products and market performance. +--------------- +\end{verbatim} + + +The benchmark summary from \texttt{gpt-4o} provides a balanced overview of the analyzed excerpt from Apple's 10-K filing, focusing on operational status, financial condition, product lines, and regulatory compliance. + +When comparing our test models against the benchmark, we observe that \texttt{gpt-4o-mini} provides a concise yet comprehensive summary that closely aligns with the benchmark's core message. While it omits product lines, it effectively captures the essential elements of the filing including business operations, risks, and financial condition. Its brevity and focus look (subjectively) similar to our benchmark model. + +\texttt{gpt-4-turbo} performs adequately but tends toward verbosity. While it includes relevant information about SEC compliance, it introduces peripheral details about seasoned issuer status and forward-looking statements. The additional complexity makes the summary less focused than \texttt{gpt-4o-mini}'s version. + +\texttt{gpt-3.5-turbo} looks quite different from the benchmark. Its summary, while factually correct, is overly simplified and misses key aspects of the filing. The model captures basic financial information but fails to convey the breadth of operational and compliance details present in the benchmark summary. + +Of course, the above evaluation is only based on a single example and is heavily subjective. It's a ``vibe check'' on our evaluation results. Now, for an objective analysis, we can look at the quantitative metrics we have chosen and use the \texttt{visualize\_prompt\_comparison} function we write below to visualize the performance of our test models across our predefined quantitative metrics. + +\begin{minted}{bash} +pip install matplotlib +\end{minted} + +\begin{minted}{python} +def visualize_prompt_comparison(evaluation_results, model_names): + """ + Create a radar plot comparing different prompt variations + + Args: + evaluation_results (list): List of dictionaries containing evaluation metrics + model_names (list): List of names for each prompt variation + """ + from evaluate.visualization import radar_plot + + # Format data for visualization + plot = radar_plot(data=evaluation_results, model_names=model_names) + return plot + +# Create and display visualization +plot = visualize_prompt_comparison(evals, models_test) +plot.show() +\end{minted} + + +\begin{figure}[h] +\centering +\includegraphics[width=0.8\textwidth]{evals/evals_30_1.png} +\caption{Radar plot comparing model performance across evaluation metrics} +\label{fig:model_comparison} +\end{figure} + +Results demonstrate that tested models perform quite differently on our predefined metrics. The evaluation metrics puts \texttt{gpt-4o-mini} as the closest aligned to the benchmark, followed by \texttt{gpt-4-turbo}, and \texttt{gpt-3.5-turbo} showing the largest deviation. This suggests that \texttt{gpt-4o-mini} is the best model for this task at least on the metrics we have chosen and for the set of models we have tested. + +While evaluating language model outputs inherently involves subjective judgment, establishing a high-quality benchmark model and using quantifiable metrics provide a more objective framework for comparing model performance. This approach transforms an otherwise qualitative assessment into a measurable, data-driven evaluation process. + +These metrics provide quantifiable measures of performance, however limitations should be mentioned: + +\begin{itemize} + \item \textbf{Task-specific nature}: Chosen set of metrics might not fully capture the nuances of complex generative-based tasks, especially those involving subjective human judgment. + \item \textbf{Sensitivity to data distribution}: Performance on these metrics can be influenced by the specific dataset used for evaluation, which might not represent real-world data distribution. + \item \textbf{Subjective Acceptable Threshold}: These metrics are not always easy to interpret and set a threshold for (see \sidecite{sarmah2024choosethresholdevaluationmetric} for a discussion on how to choose a threshold for an evaluation metric for large language models). + \item \textbf{Inability to assess reasoning or factual accuracy}: These metrics primarily focus on surface-level matching and might not reveal the underlying reasoning process of the LLM or its ability to generate factually correct information. +\end{itemize} + +In conclusion, selecting an appropriate extrinsic metrics set depends on the specific task, underlying business requirements and desired evaluation granularity. Understanding the limitations of these metrics can provide a more comprehensive assessment of LLM performance in real-world applications. + +To address these limitations, alternative approaches like \textbf{human-based evaluation} and \textbf{model-based evaluation} are often used, which will be discussed in the following sections. +\section{Evaluators} + +\subsection{Model-Based Evaluation} +\label{sec:model-based-eval} +Traditional metrics like BLEU or ROUGE often fall short in capturing the nuanced, contextual, and creative outputs of LLMs. As an alternative we can consider a ``Model-based evaluation'' approach. A common approach is to use an LLM as a judge. This is an approach that leverages language models themselves to assess the quality of outputs from other language models. This method involves using a model (often a more capable one) to act as an automated judge, evaluating aspects like accuracy, coherence, and relevance of generated content. Unlike traditional metrics that rely on exact matching or statistical measures, model-based evaluation can capture nuanced aspects of language and provide more contextual assessment. + +As discussed in the paper \sidecite{li2024leveraginglargelanguagemodels}, LLM-based evaluation approaches generally fall into two main categories: + +\begin{enumerate} + \item \textbf{Prompt-based evaluation}: This involves using prompts to instruct existing LLMs to evaluate text quality without any fine-tuning. The evaluation can take several forms: + \begin{itemize} + \item Score-based: LLMs assign numerical scores to generated text + \item Probability-based: Using generation probability as a quality metric + \item Likert-style: Rating text quality on discrete scales + \item Pairwise comparison: Directly comparing two texts + \item Ensemble methods: Combining multiple LLM evaluators + \end{itemize} + \item \textbf{Tuning-based evaluation}: This involves fine-tuning open-source LLMs specifically for evaluation tasks. This can be more cost-effective than repeatedly using API calls and allows for domain adaptation. +\end{enumerate} + +Once you have chosen your approach, a general LLM-as-a-Judge procedure involves the following steps (see Figure~\ref{fig:llm_judge}): +\begin{enumerate} + \item \textbf{Define Evaluation Criteria}: Establish clear benchmarks, such as relevance, coherence, accuracy, and fluency. + \item \textbf{Prepare Prompts}: Craft effective prompts to guide the LLM in evaluating content against the criteria. + \item \textbf{Define Reference Data}: Establish a set of reference data that the judge model can use to evaluate the generated outputs. (\textit{Optional}) + \item \textbf{Run Evaluations}: Use the judge model to score outputs. Consider using a large and/or more capable model as a judge to provide more nuanced assessments. + \item \textbf{Aggregate and Analyze Results}: Interpret scores to refine applications. +\end{enumerate} +\begin{figure}[h] +\centering +\includesvg[width=0.6\textwidth]{evals/llm_judge.svg} +\caption{Conceptual overview of LLM-as-a-Judge evaluation.} +\label{fig:llm_judge} +\end{figure} + +Compared to traditional metrics, LLM-as-a-Judge evaluation offers a more sophisticated assessment framework by leveraging natural language criteria. While metrics focus on statistical measures, judge models excel at evaluating subjective qualities such as creativity, narrative flow, and contextual relevance - aspects that closely mirror human judgment. The judge model processes evaluation guidelines expressed in natural language, functioning similarly to a human reviewer interpreting assessment criteria. One notable consideration is that this approach requires careful prompt engineering to properly define and communicate the evaluation standards to the model. + +Prompt Engineering can have a large impact on the quality of the evaluation \sidecite{li2024leveraginglargelanguagemodels}. Hence, it's worth noting key prompting best practices when designing LLM-as-a-judge evaluators \sidecite{huggingface2024llmjudge}: +\begin{enumerate} + \item Use discrete integer scales (e.g., 1-5) rather than continuous ranges + \item Provide clear rubrics that define what each score level means + \item Include reference answers when available to ground the evaluation + \item Break down complex judgments into specific evaluation criteria +\end{enumerate} + +Additionally, the interpretability of the evaluation framework can be fostered by: +\begin{enumerate} + \item Requiring explanations and reasoning for scores to increase transparency + \item Having a hollistic evaluation by considering multiple dimensions such as coherence, relevance, and fluency +\end{enumerate} +Below we provide a sample implementation of an LLM-as-a-Judge evaluation system for our LLM application that generates SEC filing summaries. The code defines: + +\begin{enumerate} + \item A \texttt{JudgeEvaluation} Pydantic model that enforces type validation for four key metrics: + \begin{itemize} + \item \textbf{Expertise}: Rating of analyst-level writing quality + \item \textbf{Coherence}: Score for logical organization + \item \textbf{Fluency}: Assessment of grammar and clarity + \item \textbf{Similarity}: Measure of alignment with reference text + \end{itemize} + + \item An \texttt{evaluate\_with\_llm()} function that: + \begin{itemize} + \item Takes a judge model, candidate summary, and reference summary as inputs + \item Constructs a detailed prompt instructing the LLM to act as an expert evaluator + \item Uses structured output parsing to return scores in a consistent format + \item Returns scores on a 1-10 scale for each evaluation criterion + \end{itemize} +\end{enumerate} + +The implementation demonstrates how to combine structured data validation with natural language evaluation to create a robust automated assessment system. + +\begin{minted}{python} +from pydantic import BaseModel +from typing import List, Dict + +class JudgeEvaluation(BaseModel): + expertise: int + coherence: int + fluency: int + similarity: int +def evaluate_with_llm(judge_model: str, candidate_summary: str, reference_summary: str) -> Dict[str, float]: + """ + Use an LLM to evaluate a candidate summary against a reference summary. + + Args: + judge_model (str): Name of the model to use as the judge. + candidate_summary (str): Generated summary to evaluate. + reference_summary (str): Ground truth or benchmark summary. + + Returns: + dict: Dictionary containing evaluation scores for specified criteria. + """ + prompt = f""" + ROLE: You are an expert evaluator of SEC Filing summaries. Evaluate the following candidate summary against the reference summary on a scale of 1 to 10 for the following criteria: + - Expertise: Does the summary look like it was written by an expert analyst? + - Coherence: Is the candidate summary logically organized and easy to understand? + - Fluency: Is the language of the candidate summary clear and grammatically correct? + - Similarity: How similar is the candidate summary compared to the reference summary? + + Reference Summary: + "{reference_summary}" + + Candidate Summary: + "{candidate_summary}" + + Provide scores in this format: + Expertise: X, Coherence: Y, Fluency: Z, Similarity: W + """ + completion = client.beta.chat.completions.parse( + model=judge_model, + messages=[{"role": "system", "content": prompt}], + response_format=JudgeEvaluation + ) + return completion.choices[0].message.parsed +\end{minted} +Next, we define an \texttt{evaluate\_summary\_models} function that leverages our LLM-as-a-Judge function to compare summaries generated by different language models. The function works in three steps: + +First, it generates a benchmark summary using the specified benchmark model. Then, it generates summaries using each of the test models. Finally, it evaluates each test model's summary against the benchmark using the judge model. + +As a result, we get a list of evaluation results we can use to compare our candidate LLM models across our predefined metrics. + +\begin{minted}{python} +def evaluate_summary_models(judge_model: str, benchmark_model: str, test_models: List[str], input_text: str): + """ + Evaluate summaries generated by multiple models using an LLM-as-a-Judge approach. + + Args: + judge_model (str): Name of the model to use as the judge. + benchmark_model (str): Name of the benchmark model. + test_models (list): List of model names to test. + input_text (str): Input text for summarization. + + Returns: + tuple: Evaluation results, model summaries, benchmark summary. + """ + benchmark_summary = generate_summary(benchmark_model, input_text) + model_summaries = [generate_summary(model, input_text) for model in test_models] + + evaluation_results = [ + evaluate_with_llm(judge_model, summary, benchmark_summary) + for summary in model_summaries + ] + + return evaluation_results, model_summaries, benchmark_summary +\end{minted} + +\begin{minted}{python} +# Example Usage +model_benchmark = "gpt-4o" +models_test = ["gpt-4o-mini", "gpt-4-turbo", "gpt-3.5-turbo"] +judge_model = "gpt-4o" + +evals, model_summaries, benchmark_summary = evaluate_summary_models( + judge_model, model_benchmark, models_test, sec_filing +) +\end{minted} +Here, we can see the benchmark summary coming from our benchmark model \texttt{gpt-4o}: + +\begin{minted}{python} +benchmark_summary +\end{minted} + + + \begin{verbatim} + "Apple Inc.'s annual report for the fiscal year ending September 28, 2024, details its business operations, financial condition, and product lines, including iPhones, Macs, iPads, and wearables, and incorporates forward-looking statements regarding its future performance." + \end{verbatim} + + +Next, we obtain the summaries and evaluation results generated by our test models, \texttt{gpt-4o-mini}, \texttt{gpt-4-turbo} and \texttt{gpt-3.5-turbo}, respectively. + +\begin{minted}{python} +model_summaries +\end{minted} + + + + \begin{verbatim} + ['Apple Inc. filed its annual Form 10-K report for the fiscal year ended September 28, 2024, detailing its business operations, product lines, and financial performance.', + "This Form 10-K filing by Apple Inc. for the fiscal year ended September 28, 2024, is an annual report detailing the company's financial performance, including registered securities, compliance with SEC reporting standards, and contains sections on business operations, risk factors, financial data, and management analysis.", + 'Apple Inc., a California-based technology company, reported an aggregate market value of approximately $2.6 trillion held by non-affiliates, with 15.1 billion shares of common stock outstanding as of October 18, 2024.'] + \end{verbatim} + + +As a result, we obtain a list of objects from the defined \texttt{JudgeEvaluation} Pydantic class which contains the evaluation metrics: expertise, coherence, fluency and similarity. + +\begin{minted}{python} +evals +\end{minted} + +\begin{verbatim} +[JudgeEvaluation(expertise=7, coherence=8, fluency=8, similarity=7), + JudgeEvaluation(expertise=7, coherence=7, fluency=8, similarity=5), + JudgeEvaluation(expertise=4, coherence=5, fluency=7, similarity=2)] +\end{verbatim} + + +\begin{minted}{python} +# Convert evaluation objects to dictionaries +evals_list = [ + { + "expertise": eval.expertise, + "coherence": eval.coherence, + "fluency": eval.fluency, + "similarity": eval.similarity + } + for eval in evals +] + +# Visualize results +plot = visualize_prompt_comparison(evals_list, models_test) +plot.show() +\end{minted} + + + +\begin{figure}[h] +\centering +\includegraphics{evals/evals_46_1.png} +\caption{Evaluation metrics comparison across test models} +\label{fig:eval-comparison} +\end{figure} + +Analyzing the evaluation results across our test models (\texttt{gpt-4o-mini}, \texttt{gpt-4-turbo}, \texttt{gpt-3.5-turbo}), several interesting patterns emerge: + +The \texttt{gpt-4o-mini} model demonstrated strong performance, achieving high scores across all metrics (expertise: 7, coherence: 8, fluency: 8, similarity: 7). This performance suggests it maintained robust quality despite being a smaller variant of our benchmark model \texttt{gpt-4o}. + +The \texttt{gpt-4-turbo} model exhibited comparable expertise and fluency scores (7 and 8 respectively) but showed slightly reduced coherence (7) and notably lower similarity (5) compared to the benchmark. These results could indicate some divergence from the reference summary while maintaining overall quality. + +The \texttt{gpt-3.5-turbo} model recorded the lowest scores (expertise: 4, coherence: 5, fluency: 7, similarity: 2), with particular weaknesses in expertise and similarity to the benchmark. While maintaining acceptable fluency, the marked decrease in similarity score indicates substantial deviation from the reference summary. + +Figure \ref{fig:eval-comparison} illustrates these differences across models and evaluation dimensions. A distinct performance gradient can be observed from \texttt{gpt-4o-mini} to \texttt{gpt-3.5-turbo}, with the latter exhibiting significant degradation across most metrics. +Leveraging LLMs for evaluation has several limitations \sidecite{li2024leveraginglargelanguagemodels}. Firstly, computational overhead should not be neglected given the inherent cost of running additional model inferences iterations. LLM evaluators can also exhibit various biases, including order bias (preferring certain sequence positions), egocentric bias (favoring outputs from similar models), and length bias. Further, there may be a tight dependency on prompt quality - small prompt variations may lead to substantially different outcomes. It is important to also note challenges around domain-specific evaluation in fields such as medicine, finance, law etc, where a general llm-as-a-judge approach may not be suitable. + +The LLM-as-a-Judge strategy can serve as a scalable and nuanced solution to evaluate LLM-based applications. While it does not entirely replace metrics-based or human-based approaches, it significantly augments evaluation workflows, especially in scenarios requiring evaluation of generative outputs. Future improvements in our example include integrating human oversight and refining LLMs for domain-specific evaluation tasks. + +One open source solution trying to overcome some of these challenges is Glider \sidecite{deshpande2024glidergradingllminteractions}, a 3B evaluator LLM that can score any text input and associated context on arbitrary user defined criteria. Glider is an LLM model trained on 685 domains and 183 criteria whose judgement scores show $91.3\%$ agreement with human judgments, making it suitable for a diverse range of real world applications. + +\section{Evaluating Evaluators} + +We have discussed how LLMs can be used to evaluate LLM-based aplications. However, how can we evaluate the performance of LLMs that evaluate other LLMs? This is the question that meta evaluation aims to answer. Clearly, the discussion can become quite meta as we need to evaluate the performance of the evaluator to evaluate the performance of the evaluated model. However, one can make a case for two general options: + +\begin{enumerate} + \item Use a golden-standard dataset that is used to evaluate the performance of LLM evaluators using a ``metrics-based'' approach. + \item Use a human evaluator to generate reference scores that can be used to evaluate the performance of the LLM evaluator (similar to the human-based evaluation we discussed earlier). +\end{enumerate} +As depicted in Figure~\ref{fig:meta}, the performance of the LLM evaluator can be evaluated by comparing its scores to either a golden-standard dataset or human reference scores. Higher correlation values indicate better performance of the LLM evaluator. For instance, if we were to evaluate the performance of a LLM-as-a-judge evaluator, in the task of evaluating multilingual capability of an LLM: +\begin{enumerate} + \item In a ``metrics-based'' approach, we would first need to define a set of metrics that capture the task of multilingual capability. For instance, we could use the BLEU metric to evaluate the quality of the generated LLM output against a golden dataset (e.g. machine translated text). We would then calculate the correlation between these scores against those generated by the LLM evaluator. The higher the correlation, the better the LLM evaluator. + \item In a ``human-based'' approach, we would need to recruit human evaluators that are experts in the target languages we are evaluating. Expert humans would provide scores for a set of samples of the input LLM. We would then calculate the correlation between these scores against those generated by the LLM evaluator. The higher the correlation, the better the LLM evaluator. +\end{enumerate} + +\begin{figure}[h] +\centering +\includegraphics{evals/meta.png} +\caption{Conceptual overview of LLMs Meta Evaluation} +\label{fig:meta} +\end{figure} +An alternative to the above approaches is to use humans to directly evaluate the LLM-judges themselves. A notable example of this is Judge Arena \sidecite{judgearena2024}, which is a platform that allows users to vote on which AI model made the better evaluation. Under this approach, the performance of the LLM evaluator is given by the (blind) evaluation of humans who perform the voting on randomly generated pairs of LLM judges as depicted in Figure~\ref{fig:meta2}. Only after submitting a vote, users can see which models were actually doing the judging. + +\begin{figure}[h] +\centering +\includegraphics{evals/meta2.png} +\caption{Human-in-the-loop Meta Evaluation} +\label{fig:meta2} +\end{figure} + +The LLM input and its prompt are displayed to the human evaluator and are customizable enabling task-specific meta evaluation. Further, the Judge Arena's LLM Judge's prompt is also editable by the user. Its default prompt is presented below: +\begin{quote} +Does the model provide relevant and useful responses to the user's needs or questions? + +\textbf{Scoring Rubric:} + +Score 1: The model's responses are irrelevant or unhelpful to the user's needs or queries. + +Score 2: The model sometimes provides helpful information, but often fails to address the user's actual needs or questions. + +Score 3: The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark. + +Score 4: The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies. + +Score 5: The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries. +\end{quote} + +Judge Arena's approach and policy framework has three key benefits worth highlighting: +\begin{enumerate} + \item Transparency through open-source code, documentation, and data sharing + \item LLM inclusion criteria requiring scoring/critique capabilities and public accessibility + \item ELO-based leaderboard system with community involvement in evaluations +\end{enumerate} + +In that way, the platform enables democratic evaluation of AI judges while maintaining transparency and accessibility standards. + +\section{Benchmarks and Leaderboards} + +Benchmarks act as standardized tests for LLMs, evaluating their performance across a spectrum of tasks. These tasks simulate real-world applications such as answering questions, generating coherent text, solving mathematical problems, or even writing computer code. They also assess more abstract qualities like fairness, robustness, and cultural understanding. + +Benchmarks can be thought as comprehensive ``exams'' that probe different ``subjects'' in order to certify an LLM. They help researchers and developers compare models systematically, in a way LLM performance is comparable while enabling the identification of emergent behaviors or capabilities as models evolve in scale and sophistication. + +The history of LLM benchmarks reflects the evolving priorities of artificial intelligence research, starting with foundational tasks and moving toward complex, real-world challenges. We can start in 2018 with the introduction of \textbf{GLUE} (General Language Understanding Evaluation) \sidecite{wang2019gluemultitaskbenchmarkanalysis}, which set a new standard for evaluating natural language understanding. GLUE measured performance on tasks like sentiment analysis and textual entailment, providing a baseline for assessing the fundamental capabilities of language models. Later, \textbf{SuperGLUE} \sidecite{nangia2019superglue} expanded on this foundation by introducing more nuanced tasks that tested reasoning and language comprehension at a deeper level, challenging the limits of models like BERT and its successors. + +As AI capabilities grew, benchmarks evolved to capture broader and more diverse aspects of intelligence. \textbf{BIG-Bench} \sidecite{srivastava2023imitationgamequantifyingextrapolating} marked a turning point by incorporating over 200 tasks, spanning arithmetic, logic, and creative problem-solving. This collaborative effort aimed to probe emergent abilities in large models, offering insights into how scale and complexity influence performance. Around the same time, specialized benchmarks like \textbf{TruthfulQA} \sidecite{2021truthfulqa} emerged, addressing the critical need for models to provide accurate and non-deceptive information in a world increasingly dependent on AI for factual content. + +\textbf{MMLU} (Massive Multitask Language Understanding) \sidecite{hendrycks2021measuringmassivemultitasklanguage} launched in 2021, provided a rigorous test of a model's multidisciplinary knowledge, covering 57 subjects from STEM fields to humanities and social sciences. Similarly, in 2022, Stanford's \textbf{HELM} (Holistic Evaluation of Language Models) \sidecite{liang2023holisticevaluationlanguagemodels} set a new standard for multidimensional assessment. HELM expanded the scope of evaluation beyond accuracy, incorporating factors like fairness, robustness, and computational efficiency. This benchmark was designed to address societal concerns surrounding AI, emphasizing safety and inclusion alongside technical performance. + +Specialized benchmarks like \textbf{HumanEval} (2021) \sidecite{chen2021evaluatinglargelanguagemodels} focused on domain-specific tasks, such as code generation, testing models' ability to translate natural language descriptions into functional programming code. In contrast, \textbf{LMSYS} (2023) brought real-world applicability into focus by evaluating conversational AI through multi-turn dialogues. LMSYS prioritized coherence, contextual understanding, and user satisfaction, providing a practical lens for assessing models like GPT and Claude in dynamic settings. + +The \textbf{HuggingFace Open LLM} \sidecite{openllmleaderboard2024} Leaderboard stands out for its transparency and accessibility in the open-source community. This leaderboard evaluates a wide range of LLMs across diverse tasks, including general knowledge, reasoning, and code-writing. Its commitment to reproducibility ensures that results are verifiable, enabling researchers and practitioners to replicate findings. By focusing on open-source models, it democratizes AI research and fosters innovation across communities, making it a valuable resource for both academics and industry professionals. + +The \textbf{Chatbot Arena} (2024) Leaderboard (an evolution of LMSYS) \sidecite{chiang2024chatbotarenaopenplatform} takes an alternative approach by measuring real-world performance through direct model comparisons. Its evaluation format compares models in live conversations, with human judges providing qualitative assessments. This methodology has gathered hundreds of thousands of human evaluations, offering specific insights into practical model performance. The emphasis on interactive capabilities makes it relevant for developing user-facing applications like virtual assistants and chatbots. + +The \textbf{AlpacaEval} \sidecite{dubois2024lengthcontrolledalpacaevalsimpleway} and \textbf{MT-Bench} \sidecite{zheng2023judgingllmasajudgemtbenchchatbot} Leaderboards implement automated evaluation using LLMs to assess model performance in multi-turn conversations. This approach enables consistent assessment of dialogue capabilities while reducing human bias. Their methodology measures key aspects of conversational AI, including contextual understanding and response consistency across multiple exchanges. + +An important recent development was the release of Global-MMLU \sidecite{singh2024globalmmluunderstandingaddressing}, an improved version of MMLU with evaluation coverage across 42 languages. This open dataset, built through collaboration between Argilla, the Hugging Face community, and researchers from leading institutions like Cohere For AI, Mila, MIT, and others, represents a significant step toward more inclusive multilingual LLM evaluation. Hundreds of contributors used Argilla to annotate MMLU questions, revealing that $85\%$ of questions requiring specific cultural knowledge were Western-centric. The newly released dataset is divided into two key subsets: Culturally Agnostic questions that require no specific regional or cultural knowledge, and Culturally Sensitive questions that depend on dialect, cultural, or geographic knowledge. With high-quality translations available for 25 languages, Global-MMLU enables better understanding of LLM capabilities and limitations across different languages and cultural contexts. + +A major challenge with these leaderboards and benchmarks is test set contamination - when test data ends up in newer models' training sets, rendering the benchmarks ineffective. While some benchmarks try to address this through crowdsourced prompts and evaluations from humans or LLMs, these approaches introduce their own biases and struggle with difficult questions. \textbf{LiveBench} \sidecite{white2024livebenchchallengingcontaminationfreellm} represents a novel solution, designed specifically to be resilient to both contamination and evaluation biases. As the first benchmark with continuously updated questions from recent sources, automated objective scoring, and diverse challenging tasks across multiple domains, LiveBench maintains its effectiveness even as models improve. Drawing from recent math competitions, research papers, news, and datasets, it creates contamination-free versions of established benchmark tasks. Current results show even top models achieving considerably lower performance compared to other benchmarks, demonstrating LiveBench's ability to meaningfully differentiate model capabilities with relatively lower saturation. With monthly updates and an open collaborative approach, LiveBench aims to provide sustained value for model evaluation as the field advances. + +Another notable benchmark is ZebraLogic \sidecite{zebralogic2024}, which evaluates logical reasoning capabilities of LLMs through Logic Grid Puzzles - a type of Constraint Satisfaction Problem \sidecite{brailsford1999constraint} commonly found in tests like the LSAT. These puzzles require assigning unique values to $N$ houses across $M$ different features based on given clues, demanding strategic reasoning and deduction to arrive at a unique correct solution. The benchmark's programmatically generated puzzles range from $2\times2$ to $6\times6$ in size and test LLMs using one-shot examples with reasoning steps. While humans can solve these puzzles through strategic methods like reductio ad absurdum and elimination, LLMs demonstrate significant limitations in this type of logical reasoning. Even the best-performing model, Claude 3.5 Sonnet, only achieves $33.4\%$ accuracy across all puzzles and $12.4\%$ on hard puzzles, with smaller models (7-10B parameters) solving less than $1\%$ of hard puzzles as of December 2024. These results reveal critical gaps in LLMs' capabilities around counterfactual thinking, reflective reasoning, structured memorization, and compositional generalization. + +A significant milestone in AI evaluation came with the launch of the \textbf{The Alignment Research Center (ARC) Prize} \sidecite{arcprize2024} by ARC Prize Inc., a non-profit for the public advancement of open artificial general intelligence. Hosted by Mike Knoop (Co-founder, Zapier) and François Chollet (Creator of Keras), this prize represents a paradigm shift in how we evaluate language models. Rather than focusing on narrow performance metrics, the ARC Prize assesses what it calls ``cognitive sufficiency'' - a model's ability to generate meaningful insights and tackle open-ended challenges. This new way to think about LLM evaluation emphasizes creative thinking, sophisticated reasoning, and the capacity to make genuinely useful contributions to human knowledge. Arguably, it is an attempt to define and measure a step towards what it means to achieve AGI (Artificial General Intelligence). + +\begin{quote} +\textbf{Defining AGI according to ARC Prize:} + +Consensus but wrong: +\begin{itemize} + \item AGI is a system that can automate the majority of economically valuable work. +\end{itemize} + +Correct: +\begin{itemize} + \item AGI is a system that can efficiently acquire new skills and solve open-ended problems. +\end{itemize} +\end{quote} + +The ARC benchmark distinguishes itself from other LLM benchmarks especially in its resistance to memorization by prioritizing: + +\begin{itemize} + \item \textbf{Focus on Core Knowledge:} Unlike LLM benchmarks that test a broad range of knowledge and skills, often relying heavily on memorization, ARC focuses on core knowledge similar to what a four or five-year-old child might possess. This includes basic concepts like object recognition, counting, and elementary physics. + + \item \textbf{Novelty of Tasks:} Each ARC puzzle is designed to be novel, meaning it's something you likely wouldn't have encountered before, even if you had memorized the entire internet. This characteristic directly challenges the way LLMs typically operate, which is by leveraging their vast ``interpolative memory.'' + + \item \textbf{Emphasis on Program Synthesis:} ARC tasks require models to synthesize new solution programs on the fly for each unique puzzle. This stands in contrast to the more common LLM approach of retrieving pre-existing solution programs from memory. + + \item \textbf{Resistance to Brute Force Attempts:} While acknowledging the possibility, ARC aims to be resistant to brute-force approaches where a model might be trained on millions of similar puzzles to achieve a high score by relying on overlap with the test set. +\end{itemize} + +ARC-AGI tasks are a series of three to five input and output tasks followed by a final task with only the input listed (e.g. Figure~\ref{arc}). Each task tests the utilization of a specific learned skill based on a minimal number of cognitive priors. A successful submission is a pixel-perfect description (color and position) of the final task's output. + +\begin{figure}[h] +\centering +\includegraphics[scale=0.5]{evals/arc.png} +\label{arc} +\end{figure} +These features make the ARC benchmark a unique test of machine intelligence, focusing on the ability to adapt to novelty and solve problems without relying heavily on memorization. This is more aligned with the concept of general intelligence, which emphasizes the ability to learn efficiently and tackle new challenges. + +The ARC-AGI benchmark remained unbeaten for five years as of December 2024 (a minimum score of $85\%$ in the private dataset is required to win) \sidecite{arcprizeresults2024}. A key takeaway is that algorithmic improvements, rather than massive computational resources, may be key to exceeding the target score for the ARC-AGI benchmark. + +In addition to the benchmarks discussed above, a growing set of domain-specific benchmarks is emerging to help evaluate LLMs in specific verticals, including: +\begin{itemize} + \item \textbf{FinBench} \sidecite{zhang2024finbench}: Evaluates LLMs in the financial domain, covering tasks such as terminology understanding, temporal reasoning, future forecasting, scenario planning, and numerical modelling. + \item \textbf{LegalBench} \sidecite{guha2023legalbench}: Assesses the legal reasoning abilities of LLMs through tasks crowdsourced by legal professionals + \item \textbf{Berkeley Function Leaderboard (BFCL)} \sidecite{patil2023gorilla}: Evaluates LLMs' function-calling abilities +\end{itemize} + +As language models continue to advance in capability and complexity, evaluation frameworks must evolve. Modern benchmarks increasingly incorporate tests for nuanced reasoning, ethical decision-making, and emergent capabilities that weren't previously measurable. This ongoing evolution reflects a deeper understanding that the true value of language models lies not in achieving high scores on standardized tests with narrow task-specific metrics, but in their ability to meaningfully contribute to human understanding and help solve real-world problems while demonstrating the ability to learn and adapt to new tasks. + +In the following sections, we will explore some open source tools developers can use to automate and streamline the challenging task of LLMs evals. +\section{Tools} + +\subsection{LightEval} + +LightEval \sidecite{lighteval} is a lightweight framework for evaluation of LLMs across a variety of standard and bespoke metrics and tasks across multiple inference backends via Python SDK and CLI. + +As a motivating example, consider a scenario where financial data has been extracted from SEC financial filings and require econometric analysis. Tasks like estimating autoregressive models for time series forecasting or conducting hypothesis tests on market efficiency are common in financial analysis. Let's evaluate how well different models perform on this type of task. + +First, we need to select a benchmark to assess LLMs capabilities in this domain. MMLU has a sub-benchmark called Econometrics we can use for this task. Table~\ref{mmlu-econometrics} shows a sample of the benchmark dataset from MMLU Econometrics. It consists of multiple-choice questions from econometrics and expected answers. + +\begin{table*}[h] +\caption{MMLU Econometrics Task Dataset sample} +\label{mmlu-econometrics} +\begin{tabular}{p{0.3\textwidth}p{0.3\textwidth}p{0.15\textwidth}p{0.1\textwidth}p{0.15\textwidth}} +\hline +Question & Options & Correct Options & Index & Literal \\ +\hline +Consider the following AR(1) model with the disturbances having zero mean and unit variance: $y_t = 0.2 + 0.4 y_{t-1} + u_t$ The (unconditional) mean of y will be given by & ["0.2", "0.4", "0.5", "0.33"] & ["b"] & [3] & ["0.33"] \\ +\hline +Suppose that a test statistic has associated with it a p-value of 0.08. Which one of the following statements is true? (i) If the size of the test were exactly 8\%, we... & ["(ii) and (iv) only", "(i) and (iii) only", "(i), (ii), and (iii) only", "(i), (ii), (iii), and (iv)"] & ["c"] & [2] & ["(i), (ii), and (iii) only"] \\ +\hline +What would be then consequences for the OLS estimator if heteroscedasticity is present in a regression model but ignored? & ["It will be biased", "It will be inconsistent", "It will be inefficient", "All of (a), (b) and (c) will be true."] & ["c"] & [2] & ["It will be inefficient"] \\ +\hline +Suppose now that a researcher wishes to use information criteria to determine the optimal lag length for a VAR. 500 observations are available for the bivariate VAR... & ["1 lag", "2 lags", "3 lags", "4 lags"] & ["c"] & [2] & ["3 lags"] \\ +\hline +\end{tabular} +\end{table*} +The code sample below demonstrates the LightEval Python SDK framework for evaluating a target LLM model on a given task. First, we instantiate an \texttt{EvaluationTracker} which manages result storage, in this example kept in a local directory \texttt{output\_dir}, and tracks detailed evaluation metrics, optionally pushed to HuggingFace Hub. + +Next, we instantiate an object of the class \texttt{PipelineParameters} which, in this example, configures the pipeline for parallel processing with a temporary cache in \texttt{cache\_dir} also setting the maximum number of samples to process to \texttt{max\_samples}. Then, in \texttt{BaseModelConfig} we set up the LLM model we would like to evaluate defined in \texttt{pretrained}. + +\begin{minted}{bash} +pip install lighteval[accelerate] +\end{minted} +\begin{minted}{python} +import lighteval +from lighteval.logging.evaluation_tracker import EvaluationTracker +from lighteval.models.model_config import BaseModelConfig +from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters +from lighteval.utils.utils import EnvConfig +from lighteval.utils.imports import is_accelerate_available +from datetime import timedelta +from accelerate import Accelerator, InitProcessGroupKwargs + + +def create_evaluation_pipeline(output_dir: str, cache_dir: str, pretrained: str, dtype: str = "float16", max_samples: int = 10, task: str): + if is_accelerate_available(): + from accelerate import Accelerator, InitProcessGroupKwargs + accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))]) + else: + accelerator = None + + evaluation_tracker = EvaluationTracker( + output_dir=output_dir, + save_details=True, + push_to_hub=False + ) + + pipeline_params = PipelineParameters( + launcher_type=ParallelismManager.ACCELERATE, + env_config=EnvConfig(cache_dir=cache_dir), + override_batch_size=1, + max_samples=max_samples + ) + + model_config = BaseModelConfig( + pretrained=pretrained, + dtype=dtype, + use_chat_template=True, + trust_remote_code=True + ) + + pipeline = Pipeline( + tasks=task, + pipeline_parameters=pipeline_params, + evaluation_tracker=evaluation_tracker, + model_config=model_config + ) + + return pipeline +\end{minted} +Figure~\ref{fig:lighteval} shows a schematic representation of its key components. As inference engine, we leverage \texttt{accelerate} for distributed evaluation. \texttt{lighteval} also supports other inference backends such as \texttt{vllm} and \texttt{tgi}. + +\begin{figure}[h] +\centering +\includegraphics{evals/lighteval.png} +\caption{LightEval Python SDK Sample Conceptual Overview.} +\label{fig:lighteval} +\end{figure} + +This setup allows for systematic evaluation of language model performance on specific tasks while handling distributed computation and result tracking. + +The final Pipeline combines these components to evaluate in the user defined \texttt{task}, which follows the following format: + +\begin{minted}{bash} +\{suite\}|\{task\}|\{num_few_shot\}|\{0 or 1 to automatically reduce num_few_shot if prompt is too long\} +\end{minted} + +The task string format follows a specific pattern with four components separated by vertical bars ($|$): + +\begin{enumerate} +\item suite: The evaluation suite name (e.g., ``leaderboard'') +\item task: The specific task name (e.g., ``mmlu:econometrics'') +\item num\_few\_shot: The number of few-shot examples to use (e.g., ``0'' for zero-shot) +\item A binary flag (0 or 1) that controls whether to automatically reduce the number of few-shot examples if the prompt becomes too long +\end{enumerate} +LightEval provides a comprehensive set of evaluation tasks \sidecite{lighteval_tasks} and metrics \sidecite{lighteval_metrics}. The available tasks span multiple categories and benchmarks including BigBench, MMLU, TruthfulQA, WinoGrande, and HellaSwag. The framework also supports standard NLP evaluation metrics including BLEU, ROUGE, Exact Match, F1 Score, and Accuracy. + +In our case, we choose to evaluate our LLMs on the MMLU econometrics task using zero-shot learning. Hence, we define the \texttt{task} as follows: + +\begin{minted}{python} +task = "leaderboard|mmlu:econometrics|0|0" +\end{minted} + +Example usage to evaluate an LLM, for instance \texttt{meta-llama/Llama-3.2-1B-Instruct}, on the MMLU econometrics task using zero-shot learning: + +\begin{minted}{python} +task = "leaderboard|mmlu:econometrics|0|0" +model = "meta-llama/Llama-3.2-1B-Instruct" +pipeline = create_evaluation_pipeline(output_dir="./evals/", + cache_dir="./cache/", + pretrained=model, + task=task) +\end{minted} +We can then evaluate the pipeline, save and show its results as follows: + +\begin{minted}{python} +pipeline.evaluate() +pipeline.save_and_push_results() +pipeline.show_results() +\end{minted} + +The results are then stored in \texttt{output\_dir} in JSON format. + +The same results can be obtained by using the LightEval CLI: + +\begin{minted}{bash} +lighteval accelerate --model_args "pretrained=meta-llama/Llama-3.2-1B-Instruct" --tasks "leaderboard|mmlu:econometrics|0|0" --override_batch_size 1 --output_dir="./evals/" +\end{minted} +Comparing the performance of multiple open source models on the MMLU econometrics task requires careful consideration of computational resources. While local evaluation is possible, leveraging a remote server proves more efficient in terms of time and resources. LightEval facilitates this by enabling model serving on a TGI-compatible server/container and executing evaluations through server requests \sidecite{lighteval_server}. + +The HuggingFace Serverless Inference API provides an ideal solution for this purpose\sidenote{A bug was discovered in LightEval that initially prevented compatibility with the HuggingFace Serverless Inference API: \url{https://github.com/huggingface/lighteval/issues/422}. The LightEval team has since resolved this issue.}. The configuration file for LightEval should be structured as follows, where \texttt{} represents the model identifier on HuggingFace (e.g., \texttt{meta-llama/Llama-3.2-1B-Instruct}) and \texttt{} is the user's HuggingFace API token. Alternatively, a URL for a dedicated inference API can be specified if available. + +\begin{minted}{yaml} +model: + type: "tgi" + instance: + inference_server_address: "https://api-inference.huggingface.co/models/" + inference_server_auth: "" + model_id: null +\end{minted} +Now we can run the evaluation by sending requests to the server as follows by using the same bash command as before but now setting the \texttt{model\_config\_path} to the path of the configuration file we have just created (e.g. \texttt{endpoint\_model.yaml}): + +\begin{minted}{bash} +lighteval accelerate --model_config_path="endpoint_model.yaml" --tasks "leaderboard|mmlu:econometrics|0|0" --override_batch_size 1 --output_dir="./evals/" +\end{minted} + +To complete our task, we evaluate a few models from the following model families: \texttt{Llama3.2}, \texttt{Qwen2.5}, and \texttt{SmolLM2} as described in Table~\ref{tab:model-families}. + +\begin{table}[h] +\caption{Model Families Evaluated Using LightEval} +\label{tab:model-families} +\begin{tabular}{llll} +\hline +Model Family & Description & Models & References \\ +\hline +Llama3.2 Instruct & LLaMA architecture-based pretrained & \texttt{Llama-3.2-1B-Instruct} & \sidecite{meta_llama_models} \\ + & and instruction-tuned generative models & \texttt{Llama-3.2-3B-Instruct} & \\ +\hline +Qwen2.5 Instruct & Instruction-tuned LLMs family & \texttt{Qwen2.5-0.5B-Instruct} & \sidecite{gpt2docs,hui2024qwen2,qwen2} \\ + & built by Alibaba Cloud & \texttt{Qwen2.5-1.5B-Instruct} & \\ + & & \texttt{Qwen2.5-3B-Instruct} & \\ +\hline +SmolLM2 Instruct & Instruction-tuned family of compact & \texttt{SmolLM2-360M-Instruct} & \sidecite{allal2024SmolLM2} \\ + & language models built by HuggingFace & \texttt{SmolLM2-1.7B-Instruct} & \\ +\hline +\end{tabular} +\end{table} +We can then compare the performance of these models on the MMLU econometrics task as shown in Figure~\ref{fig:model-comparison}. + +\begin{figure}[h] +\centering +\includegraphics{evals/model-comparison.png} +\caption{Model performance comparison on MMLU Econometrics task, showing accuracy scores across different model sizes and architectures.} +\label{fig:model-comparison} +\end{figure} + +The results reveal several interesting patterns in model performance. As expected, we observe a trend where larger models consistently achieve higher accuracy scores. The evaluation shows distinct clusters among model families, with Qwen2.5, Llama-3.2, and SmolLM2 each exhibiting their own scaling characteristics, suggesting that architectural differences lead to varying degrees of efficiency as model size increases. Particularly noteworthy is the performance of the Qwen2.5 family, which demonstrates superior accuracy even at smaller model sizes when compared to Llama-3.2. + +Of course, the results should be taken with a grain of salt given the limited size of the dataset (MMLU Econometrics $\sim$ 100), limited number of models and sizes. However, it gives a good indication of the capabilities of the different models tested with Qwen2.5 family being an interesting first candidate as a relatively small yet powerful model demonstrating a good trade-off between performance and size. Once tested on real-world data, the results will change but these initial findings are a good data-driven starting point for model selection as you begin your LLM-based application development. + +In summary, LightEval is a simple yet flexible and comprehensive framework for evaluating LLMs across a wide variety of tasks and metrics. It can serve as a first step in selecting your next LLM for a specific task given the exponential growth in number of (open source) models available \sidecite{hf_num_models}. Its integration with the Hugging Face ecosystem and modular architecture make it particularly powerful for evaluating open source models. For further details, visit the official repository\sidenote{\url{https://github.com/huggingface/lighteval}} \sidecite{lighteval}. +\subsection{LangSmith} + +Let's revisit our evaluation example when we were interested in evaluating the quality of summaries generated by different (smaller and cheaper) LLM models compared to a benchmark model (larger and more expensive). Recall the setup: + +\begin{itemize} +\item Benchmark model: \texttt{gpt-4o} +\item Test models: \texttt{gpt-4o-mini}, \texttt{gpt-4-turbo}, \texttt{gpt-3.5-turbo} +\end{itemize} + +We can run evaluation using only LangSmith without the need of LangChain. + +\begin{minted}{bash} +!pip uninstall langchain +!pip uninstall langchain-community +!pip uninstall langchain-openai +!pip install langsmith +\end{minted} + +We need to generate an API key to use LangSmith. See instructions at \url{https://docs.smith.langchain.com/}. Remember to export your API\_KEY. Activating tracing will allow us to track logs and foster observability of our evaluation. + +\begin{minted}{bash} +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_API_KEY= +\end{minted} + +\begin{minted}{python} +import evaluate as hf_evaluate # HuggingFace's evaluate +from langsmith import evaluate as langsmith_evaluate # LangSmith's evaluate +from langsmith import Client +from typing import Dict, Any + +ls_client = Client() +\end{minted} + +The code below creates a dataset in LangSmith that will serve as our golden dataset for evaluation. The dataset consists of test cases where we create a single example with the following content: + +\begin{itemize} +\item An input: Our SEC filing document +\item An expected output: A golden summary generated by our benchmark model (\texttt{gpt-4o}) +\end{itemize} + +This dataset will allow us to evaluate how well other models perform compared to our benchmark by comparing their generated summaries against these reference summaries. In practice, it's recommended to create a larger dataset with more diverse examples to get a more accurate assessment of model capabilities as well as to estimate confidence intervals for target metrics. + +\begin{minted}{python} +# Define dataset: these are your test cases +dataset_name = "Golden SEC Summary Dataset" +dataset = ls_client.create_dataset(dataset_name) +ls_client.create_examples( + inputs=[ + {"sec_filing": sec_filing}, + ], + outputs=[ + {"summary": benchmark_summary}, + ], + dataset_id=dataset.id, +) +\end{minted} +Our Dataset is now available in LangSmith as shown in Figure~\ref{fig:langsmith_dataset}. +\begin{figure}[h] +\centering +\includegraphics{evals/langsmith_dataset.png} +\label{fig:langsmith_dataset} +\caption{LangSmith Dataset} +\end{figure} + +Next, we write our evaluator. This evaluator calculates BLEU scores between generated and reference summaries using HuggingFace's evaluate package. The evaluator takes two dictionaries as input - one containing the generated summary and another containing the reference summary. It returns a dictionary with the Google BLEU score, which measures the overlap between n-grams in the generated and reference texts similar to our previous metric-based experiments. + +\begin{minted}{python} +def calculate_scores(outputs: Dict[str, Any], reference_outputs: Dict[str, Any]) -> dict: + """ + Custom evaluator that calculates BLEU and ROUGE scores between generated and reference summaries + using HuggingFace's evaluate package + + Args: + outputs (dict): Contains the generated summary + reference_outputs (dict): Contains the reference summary + + Returns: + dict: Dictionary containing Google BLEU score + """ + generated = outputs.get("summary", "") + reference = reference_outputs.get("summary", "") + + # Initialize metrics from HuggingFace's evaluate + bleu = hf_evaluate.load("google_bleu") + + # Format inputs for BLEU (expects list of str for predictions and list of list of str for references) + predictions = [generated] + references = [reference] + + # Compute BLEU score + bleu_score = bleu.compute(predictions=predictions, references=[references]) + + return {"key": "google_bleu", "score": bleu_score["google_bleu"]} +\end{minted} +Now that we have defined our evaluation metrics, let's create a function to generate summaries for our smaller models. The function below takes a dictionary containing the SEC filing text as input and returns a dictionary with the generated summary. The prompt instructs the model to act as an expert analyst and generate a one-line summary of the filing excerpt. We use the same task and model configuration as in our previous experiments to maintain consistency in our evaluation pipeline. + +\begin{minted}{python} +from openai import OpenAI +oai_client = OpenAI() +\end{minted} + +\begin{minted}{python} +TASK = "Generate a 1-liner summary of the following excerpt from an SEC filing." + +PROMPT = f""" +ROLE: You are an expert analyst tasked with summarizing SEC filings. +TASK: {TASK} +""" + +xp_model_name = "" # model to be tested + +def generate_summary(inputs: dict): + """ + Generate a summary of input using a given model + """ + TASK = "Generate a 1-liner summary of the following excerpt from an SEC filing." + + response = oai_client.chat.completions.create( + model=xp_model_name, # model_name is a global variable + messages=[{"role": "system", "content": PROMPT}, + {"role": "user", "content": inputs.get("sec_filing")}] + ) + return {"summary": response.choices[0].message.content} +\end{minted} + +Lastly we define a function to run our evaluation. The \texttt{run\_evaluation()} function uses LangSmith's \texttt{evaluate()} to run evaluations either locally or remotely. When running locally, results are not uploaded to LangSmith's servers. The function takes an application, dataset, and list of evaluators as input and returns the evaluation results. The application is the \texttt{generate\_summary()} function we would like to evaluate. The \texttt{dataset} is the golden summary from the strong model. And we pass a list with our single evaluator \texttt{calculate\_scores()}. LangSmith also allows for running multiple repetitions of the same experiment to get a more accurate assessment of model capabilities as well as to estimate confidence intervals for target metrics, which we set to 5 repetitions. + +This allows us to systematically assess our LLM-based application while maintaining control over where results are stored. + +\begin{minted}{python} +def run_evaluation(app, model_name, dataset, evaluators, upload_results=False): + global xp_model_name + xp_model_name = model_name + results = langsmith_evaluate( + app, + client=None, + data=dataset, + evaluators=evaluators, + experiment_prefix=model_name, + num_repetitions=5, + upload_results= upload_results, # This is the key parameter for local evaluation + + ) + + return results +\end{minted} +Now we are ready run evaluation on our app across all target LLM models. + +\begin{minted}{python} +app = generate_summary +\end{minted} + +\begin{minted}{python} +models = ["gpt-3.5-turbo", "gpt-4-turbo", "gpt-4o-mini"] +results = [run_evaluation(app, model, dataset=dataset_name, evaluators=[calculate_scores], upload_results=True) for model in models] +\end{minted} + +We can obtain the results for all experiments including the execution time and the Google BLEU score. + +\begin{minted}{python} +import pandas as pd + +# Create list of dataframes from results +dfs = [result.to_pandas() for result in results] + +for df, model in zip(dfs, models): + df.insert(0, 'model', model) + +combined_df = pd.concat(dfs, ignore_index=True) +combined_df.head() +\end{minted} + + + +\begin{table*}[h] +\centering +\begin{tabular}{lllllllll} +\hline + & model & inputs.sec\_filing & outputs.summary & error & reference.summary & feedback.google\_bleu & execution\_time & example\_id \\ +\hline +0 & gpt-3.5-turbo & UNITED STATES\textbackslash nSECURITIES... & Apple Inc.'s Form 10-K... & None & Apple Inc.'s 10-K filing... & 0.333333 & 1.224388 & feb10f92-3167-41f3... \\ +1 & gpt-3.5-turbo & UNITED STATES\textbackslash nSECURITIES... & Apple Inc. filed its Form... & None & Apple Inc.'s 10-K filing... & 0.348101 & 0.722464 & feb10f92-3167-41f3... \\ +2 & gpt-3.5-turbo & UNITED STATES\textbackslash nSECURITIES... & Apple Inc. filed its annual... & None & Apple Inc.'s 10-K filing... & 0.386076 & 0.704104 & feb10f92-3167-41f3... \\ +3 & gpt-3.5-turbo & UNITED STATES\textbackslash nSECURITIES... & Apple Inc. filed its Annual... & None & Apple Inc.'s 10-K filing... & 0.443038 & 0.725059 & feb10f92-3167-41f3... \\ +4 & gpt-3.5-turbo & UNITED STATES\textbackslash nSECURITIES... & Apple Inc. filed its Annual... & None & Apple Inc.'s 10-K filing... & 0.373418 & 0.795302 & feb10f92-3167-41f3... \\ +\hline +\end{tabular} +\caption{Evaluation Results for GPT-3.5-turbo Model} +\label{tab:eval-results} +\end{table*} + + + +```python +# Calculate statistics per model +stats = combined_df.groupby('model').agg({ + 'feedback.google_bleu': ['mean', 'std'], + 'execution_time': ['mean', 'std'] +}).round(4) + +# Sort by execution time +stats = stats.sort_values(('execution_time', 'mean')) +``` + +\begin{figure}[h] +\centering +\includegraphics{evals/evals_74_0.png} +\label{fig:eval-results} +\caption{Model Performance Comparison} +\end{figure} + + + +\begin{table}[h] +\centering +\begin{tabular}{lcccc} +\hline +\multirow{2}{*}{Model} & \multicolumn{2}{c}{Google BLEU} & \multicolumn{2}{c}{Execution Time (s)} \\ +\cline{2-5} +& Mean & Std & Mean & Std \\ +\hline +GPT-4o-mini & 0.4038 & 0.0453 & 0.7815 & 0.0433 \\ +GPT-3.5-turbo & 0.3768 & 0.0424 & 0.8343 & 0.2208 \\ +GPT-4-turbo & 0.3519 & 0.0775 & 0.9122 & 0.1482 \\ +\hline +\end{tabular} +\caption{Detailed Model Performance Statistics} +\label{tab:model-stats} +\end{table} +The evaluation results reveal notable differences between the models. GPT-3.5-turbo achieved a Google BLEU score of $0.377 \pm 0.042$ with average execution time of $0.83s \pm 0.22s$. GPT-4-turbo scored slightly lower at $0.352 \pm 0.078$ and was slower at $0.91s \pm 0.15s$. GPT-4o-mini performed best with a BLEU score of $0.404 \pm 0.045$ while being fastest at $0.78s \pm 0.04s$. + +As expected, results suggest that the newer GPT-4o-mini model achieves better quality while maintaining lower latency compared to both GPT-3.5 and GPT-4 turbo variants. The standard deviations indicate that GPT-4-turbo has the most variable output quality, while GPT-4o-mini is most consistent in both quality and speed. Interestingly, the more advanced gpt-4-turbo model has lower BLEU scores but takes longer to execute. This suggests that model size and computational complexity don't necessarily correlate with better performance on this specific summarization task. Of course, this is a very simple task further increasing the number of experiment iterations will yield more accurate results. + +Since we decided to upload result, we can also visualize the experiment results in LangSmith as shown in Figure~\ref{fig:langsmith}. + +\begin{figure}[h] +\centering +\includegraphics{evals/langsmith.png} +\label{fig:langsmith} +\caption{LangSmith Experiment Results} +\end{figure} +\subsection{PromptFoo} + +PromptFoo \sidecite{promptfoo2024} is an open-source framework designed for evaluating applications that utilize LLMs. Key features include: + +\begin{enumerate} + \item \textbf{Automated Testing}: PromptFoo provides automated testing capabilities, allowing developers to run custom evaluations tailored to their applications. + + \item \textbf{Custom Probes}: Developers can create custom probes to focus on specific use cases for instance decoupling prompts from tests cases. + + \item \textbf{User-Friendly CLI}: The framework features a command-line interface that supports live reloads and caching, facilitating rapid testing and iteration. +\end{enumerate} + +We will use PromptFoo's command line interface in the following examples. Please follow installation instructions at \url{https://www.promptfoo.dev/docs/installation/#for-command-line-usage}. + +Evals are defined in a configuration file \texttt{promptfooconfig.yaml}, which defines elements such as providers, prompts, test cases, and assertions. + +In the following example, we will perform a two-step evaluation: + +\begin{enumerate} + \item Evaluate the performance of different LLM models given a set of constraints. + \item Evaluate the quality of different prompts for the best performing model from 1. +\end{enumerate} + + +\begin{minted}{python} +import yaml + +# Read the YAML file +with open('promptfoo/model_comparison/promptfooconfig.yaml', 'r') as f: + config = yaml.safe_load(f) + +# Pretty print the YAML content +print(yaml.dump(config, default_flow_style=False, sort_keys=False)) +\end{minted} + +\begin{minted}{yaml} +description: Best model eval +prompts: +- file://prompt1.txt +providers: +- openai:gpt-4o-mini +- openai:gpt-4 +- openai:gpt-3.5-turbo +defaultTest: + assert: + - type: cost + threshold: 0.001 + - type: latency + threshold: 1000 + - type: python + value: len(output) < 200 + - type: llm-rubric + value: Does the summary look like it was written by an expert analyst [Yes/No]? +tests: file://tests.csv +\end{minted} + +The configuration file demonstrates PromptFoo's capabilities for evaluating different LLM models. The YAML configuration defines three providers (\texttt{gpt-4o-mini}, \texttt{gpt-4}, and \texttt{gpt-3.5-turbo}) and sets up test assertions to validate their outputs. These assertions check important constraints: + +\begin{enumerate} + \item \textbf{Cost efficiency}: Each inference must cost less than \$0.001 + \item \textbf{Latency requirements}: Response time must be under 1000ms + \item \textbf{Output length}: Generated text must be less than 200 characters + \item \textbf{Output quality}: An LLM-based rubric evaluates if the output appears to be written by an expert (uses \texttt{openai:gpt-4o} model) +\end{enumerate} + +The prompts are loaded from an external file (\texttt{prompt1.txt}) and test cases are defined in \texttt{tests.csv}. This structured approach enables systematic evaluation of model performance across multiple decoupled dimensions. + +\begin{minted}{bash} +promptfoo eval --no-cache --output eval.json +\end{minted} + +This command will run the evaluation and store the results in \texttt{eval.json} while making sure that the evaluation is not cached so we are measuring actual latency of the LLMs. The code below processes the PromptFoo evaluation results stored in \texttt{eval.json}. It reads the evaluation data from the JSON file and extracts key metrics including: + +\begin{itemize} + \item Provider name (e.g. \texttt{gpt-4}, \texttt{gpt-3.5-turbo}) + \item Latency in milliseconds + \item Token usage statistics + \item Cost per request + \item Number of passed/failed assertions + \item Prompt token count + \item Total number of API requests +\end{itemize} + +\begin{minted}{python} +import json +import pandas as pd + +# Read the JSON file +with open('promptfoo/model_comparison/eval.json', 'r') as f: + eval_data = json.load(f) + +# Extract results into a list of dictionaries +results = [] +for prompt in eval_data['results']['prompts']: + result = { + 'provider': prompt['provider'], + 'latency_ms': prompt['metrics']['totalLatencyMs'], + 'token_usage': prompt['metrics']['tokenUsage']['total'], + 'cost': prompt['metrics']['cost'], + 'assert_pass': prompt['metrics']['assertPassCount'], + 'assert_fail': prompt['metrics']['assertFailCount'], + 'prompt_tokens': prompt['metrics']['tokenUsage']['prompt'], + 'num_requests': prompt['metrics']['tokenUsage']['numRequests'] + } + results.append(result) +\end{minted} + + +\begin{minted}{python} +# Convert to DataFrame +df = pd.DataFrame(results) +print(df) +\end{minted} + +\begin{table}[h] +\centering +\begin{tabular}{|l|r|r|r|} +\hline +Variable & openai:gpt-4o-mini & openai:gpt-4 & openai:gpt-3.5-turbo \\ +\hline +Latency (ms) & 2463 & 3773 & 1669 \\ +Token Usage & 97 & 103 & 95 \\ +Cost & \$0.000035 & \$0.004620 & \$0.000091 \\ +Assert Pass & 6 & 4 & 7 \\ +Assert Fail & 2 & 4 & 1 \\ +Prompt Tokens & 52 & 52 & 52 \\ +Num Requests & 2 & 2 & 2 \\ +\hline +\end{tabular} +\caption{Performance comparison across different OpenAI models} +\label{tab:model-comparison} +\end{table} +The evaluation results reveal interesting performance characteristics across different OpenAI models. \texttt{GPT-3.5-turbo} demonstrates the best overall performance given our criteria with the lowest latency (1669ms), lowest token usage (95), and highest number of passed assertions (7). While \texttt{GPT-4} shows higher token usage (103) and latency (3773ms), it also has the highest cost per request (\$0.00462). The \texttt{GPT-4-mini} variant offers a middle ground, with moderate latency and token usage, while maintaining relatively good assertion performance (6 passes). These results suggest that for this particular evaluation task, \texttt{GPT-3.5-turbo} provides the best balance of performance, reliability, and cost-effectiveness. + +Promptfoo also offers a web interface for visualizing the evaluation results as shown in Figure~\ref{fig:promptfoo1}. + +\begin{minted}{bash} +promptfoo view +\end{minted} + +We can observe results per test case (i.e. section of the SEC filing) and per provider. Humans can also manually review the results and provide feedback as well as generate new test cases. + + + +\begin{table*}[h] + \begin{tabular}{lll} + \hline + Model Family & Description & Models \\ + \hline + Llama3.2 Instruct \cite{meta_llama_models} & LLaMA architecture-based \newline pretrained and instruction-tuned \newline generative models & \texttt{Llama-3.2-1B-Instruct} and + \texttt{Llama-3.2-3B-Instruct} \\ + \hline + Qwen2.5 Instruct \cite{gpt2docs,hui2024qwen2,qwen2} & Instruction-tuned LLMs family \newline built by Alibaba Cloud & \texttt{Qwen2.5-0.5B-Instruct}, \texttt{Qwen2.5-1.5B-Instruct} and \texttt{Qwen2.5-3B-Instruct} \\ + \hline + SmolLM2 Instruct \cite{allal2024SmolLM2} & Instruction-tuned family of \newline compact language models \newline built by HuggingFace & \texttt{SmolLM2-360M-Instruct} and \texttt{SmolLM2-1.7B-Instruct} \\ + \hline + \end{tabular} + \caption{Model Families Evaluated Using LightEval} + \label{tab:model-families} + \end{table*} + + + +\begin{figure}[h] +\centering +\includegraphics[width=0.3\textwidth]{evals/promptfoo1.png} +\caption{PromptFoo evaluation results showing performance metrics across different models.} +\label{fig:promptfoo1} +\end{figure} + +Now that we have established \texttt{GPT-3.5-turbo} as our model of choice given the minimum required criteria based on cost, latency and basic qualitative evaluation, we can compare the performance of different prompts as a next evaluation step. Can we improve the quality of the summaries by using different prompts? + +First, we redefine our evaluation criteria. We now would like to select the prompt that delivers the most ``detailed'' summaries. Our updated promptfoo configuration file is shown below. + +\begin{minted}{python} +# Read the YAML file +with open('promptfoo/prompt_comparison/promptfooconfig.yaml', 'r') as f: + config = yaml.safe_load(f) + +# Pretty print the YAML content +print(yaml.dump(config, default_flow_style=False, sort_keys=False)) +\end{minted} + +\begin{minted}{yaml} +description: Best model eval +prompts: +- file://prompt1.txt +- file://prompt2.txt +- file://prompt3.txt +providers: +- openai:gpt-3.5-turbo +defaultTest: + assert: + - type: llm-rubric + value: 'Evaluate the output based on how detailed it is. Grade it on a scale + of 0.0 to 1.0, where: + + Score of 0.1: Not much detail. + + Score of 0.5: Some detail. + + Score of 1.0: Very detailed. + + ' +tests: file://tests.csv +\end{minted} + + +Note that we are now passing 3 different prompts. And we have updated our assertions to check if the output is ``detailed'' by leveraging promptfoo's \texttt{llm-rubric} assertion which will run an LLM-as-a-Judge for evaluation. Now, let's define 3 prompt variations we would like to test aiming at improving the quality/detail of the summaries. + +\begin{minted}{python} +# Display the prompt variations +from IPython.display import display, Markdown + +prompt_files = ['prompt1.txt', 'prompt2.txt', 'prompt3.txt'] +prompt_content = [] + +for file in prompt_files: + with open(f'promptfoo/prompt_comparison/{file}', 'r') as f: + content = f.read().strip() + prompt_content.append(f"### {file}\n---\n{content}\n") + +display(Markdown("\n\n".join(prompt_content))) +\end{minted} + +\begin{verbatim} +### prompt1.txt +--- +'Generate a 1-liner summary of the Section {{section}} from an SEC filing: {{content}}' + + +### prompt2.txt +--- +'ROLE: You are a financial analyst. TASK: Generate a 1-liner summary of the Section {{section}} from an SEC filing: {{content}}' + + +### prompt3.txt +--- +'ROLE: You are a financial analyst. REQUIREMENTS: BE DETAILED. TASK: Generate a 1-liner summary of the Section {{section}} from an SEC filing: {{content}}' +\end{verbatim} + + +The first prompt matches our previous prompt. The second prompt adds a ``financial analyst'' role to the prompt. The third prompt expands on second prompt and add a requirement ``BE DETAILED''. + +We can now run the evaluation again. + +\begin{minted}{bash} +promptfoo eval --output eval.json +\end{minted} + +\begin{minted}{python} +# Read the evaluation results from JSON file +import json +with open('promptfoo/prompt_comparison/eval.json', 'r') as f: + eval_data = json.load(f) + +# Create a list to store the data +data = [] + +# Extract results for each test case +for result in eval_data['results']['results']: + section = result['vars']['section'] + prompt_id = result['promptId'] + score = result['gradingResult']['score'] if 'gradingResult' in result else 0.0 + + # Find the corresponding prompt file + for prompt in eval_data['results']['prompts']: + if prompt['id'] == prompt_id: + prompt_file = prompt['label'].split(':')[0] + break + + # Add to data list + data.append([section, prompt_file, score]) + +# Convert to DataFrame +df_raw = pd.DataFrame(data, columns=['Section', 'Prompt', 'Score']) + +# Pivot to get desired format +df = df_raw.pivot(index='Section', columns='Prompt', values='Score').reset_index() +df = df[['Section', 'prompt1.txt', 'prompt2.txt', 'prompt3.txt']] + +display(Markdown("### Prompt Comparison Results by Section")) +print(df) +\end{minted} + +\begin{table}[h] +\centering +\begin{tabular}{lccc} +\hline +Section & prompt1.txt & prompt2.txt & prompt3.txt \\ +\hline +Legal Proceedings & 0.1 & 0.5 & 1.0 \\ +Risk Factors & 0.1 & 0.5 & 0.5 \\ +\hline +\end{tabular} +\end{table} + +The results demonstrate that \texttt{prompt3.txt} exhibits superior performance for Legal Proceedings sections, attaining a perfect score of 1.0 compared to 0.5 for \texttt{prompt2.txt} and 0.1 for \texttt{prompt1.txt}. For Risk Factors sections, both \texttt{prompt2.txt} and \texttt{prompt3.txt} achieve moderate scores of 0.5, while \texttt{prompt1.txt} performs poorly with 0.1. This analysis suggests that \texttt{prompt3.txt} demonstrates greater effectiveness at extracting detailed information, particularly for legal content. The findings indicate that defining a Role and specifying requirements for detailed output serves as an effective approach to enhance summary quality, at least within the constraints of this specific task, model, and evaluation criteria. + +In conclusion, Promptfoo demonstrates its value as an effective LLM application evaluation tool, particularly through its capacity to decouple various components of the evaluation process. This decoupling enables users to concentrate on the most critical aspects of evaluation based on their specific application requirements and criteria, establishing Promptfoo as a versatile and valuable tool for LLM application development. + +\subsection{Comparison} + +Table~\ref{tab:tool-comparison} presents a comparative analysis of three open source frameworks for language models evaluation discussed: Lighteval, LangSmith, and Promptfoo. Each framework undergoes assessment based on key features including integration capabilities, customization options, ease of use, and the ability to facilitate human and LLM collaboration. +\begin{table}[h] +\centering +\begin{tabular}{lp{3cm}p{3cm}p{3cm}} +\hline +\textbf{Feature/Aspect} & \textbf{Lighteval} & \textbf{LangSmith} & \textbf{Promptfoo} \\ +\hline +\textbf{Integration} & Seamless with Hugging Face models, easy access to multiple inference engines, and remote evaluation (e.g., TGI servers, HF serverless models) & User-provided models, evaluators, and metrics & CLI-based, user-provided models via YAML \\ +\hline +\textbf{Customization} & Flexible task and metric support, quick evaluation against state-of-the-art leaderboards & Easy setup of custom tasks and metrics with plain vanilla Python functions, lacks predefined tasks and metrics & Default and user-provided probes, metrics, and assertions \\ +\hline +\textbf{Ease of Use} & User-friendly, minimal setup & User-friendly, minimal setup, includes UI for result visualization & Simple CLI, rapid testing, includes UI for result visualization \\ +\hline +\textbf{Human/LLM Collaboration} & Model-based evaluation & Model-based evaluation & Supports human and model evaluators \\ +\hline +\end{tabular} +\caption{Comparison of Lighteval, LangSmith, and Promptfoo} +\label{tab:tool-comparison} +\end{table} + +\section{Conclusion} + +Language models have fundamentally transformed how software is developed and evaluated. Unlike conventional systems that produce predictable outputs, LLMs generate varied, probabilistic responses that defy traditional testing approaches. While developers accustomed to deterministic systems may find this shift challenging, continuing to rely on legacy testing methods is unsustainable. These frameworks were not designed to handle the inherent variability of LLM outputs and will ultimately prove inadequate. + +Success requires embracing this new paradigm by implementing comprehensive evals that cover the non-deterministic generative nature of LLMs - this is the new Product Requirements Document (PRD) - and cultivating an organizational mindset focused on iteration, experimentation and growth. + +The shift from traditional software testing to LLM evaluation is not just a change in tools but a transformation in mindset. Those who recognize and adapt to this shift will lead the way in harnessing the power of LLMs in software development. diff --git a/tamingllms/latex/intro.tex b/tamingllms/latex/intro.tex index f17ddc3..a280ad4 100644 --- a/tamingllms/latex/intro.tex +++ b/tamingllms/latex/intro.tex @@ -1,45 +1,46 @@ -\chapter{About the Book} - -\begin{epigraph} -I am always doing that which I cannot do, in order that I may learn how to do it. +\setchapterpreamble[u]{\margintoc} --- Pablo Picasso -\end{epigraph} +\chapter{About the Book} +\labch{intro} -\tableofcontents +\epigraph{I am always doing that which I cannot do, in order that I may learn how to do it.}{Pablo Picasso} \section{Core Challenges We'll Address} -In recent years, Large Language Models (LLMs) have emerged as a transformative force in technology, promising to revolutionize how we build products and interact with computers. From ChatGPT and LLama to GitHub Copilot and Claude Artifacts these systems have captured the public imagination and sparked a gold rush of AI-powered applications. However, beneath the surface of this technological revolution lies a complex landscape of challenges that software developers and tech leaders must navigate. +In recent years, Large Language Models (LLMs) have emerged as a transformative force in technology, promising to revolutionize how we build products and interact with computers. From ChatGPT and LLama to GitHub Copilot and Claude Artifacts these systems have captured the public imagination and sparked a gold rush of AI-powered applications. However, beneath the surface of this technological revolution lies a complex landscape of challenges that software developers and tech leaders must navigate. This book focuses on bringing awareness to key LLM limitations and harnessing open source solutions to overcome them for building robust AI-powered products. It offers a critical perspective on implementation challenges, backed by practical and reproducible Python examples. While many resources cover the capabilities of LLMs, this book specifically addresses the hidden complexities and pitfalls that engineers and technical leaders face when building LLM-powered applications while offering a comprehensive guide on how to leverage battle-tested open source tools and solutions. Throughout this book, we'll tackle the following (non-exhaustive) list of critical challenges: \begin{enumerate} -\item \textbf{Structural (un)Reliability}: LLMs struggle to maintain consistent output formats, complicating their integration into larger systems and making error handling more complex. + \item \textbf{Structural (un)Reliability}: LLMs struggle to maintain consistent output formats, complicating their integration into larger systems and making error handling more complex. -\item \textbf{Input Data Management}: LLMs are sensitive to input data format, operate with stale data and struggle with long-context requiring careful input data management and retrieval strategies. + \item \textbf{Input Data Management}: LLMs are sensitive to input data format, operate with stale data and struggle with long-context requiring careful input data management and retrieval strategies. -\item \textbf{Testing Complexity}: Traditional software testing methodologies break down when dealing with non-deterministic and generative systems, requiring new approaches. + \item \textbf{Testing Complexity}: Traditional software testing methodologies break down when dealing with non-deterministic and generative systems, requiring new approaches. -\item \textbf{Safety and Alignment}: LLMs can generate harmful, biased, or inappropriate content, requiring robust safeguards and monitoring systems to ensure safe deployment. + \item \textbf{Safety}: LLMs can generate harmful, biased, or inappropriate content, requiring robust safeguards and monitoring systems to ensure safe deployment. -\item \textbf{Vendor Lock-in}: Cloud-based LLM providers can create significant dependencies and lock-in through their proprietary APIs and infrastructure, making it difficult to switch providers or self-host solutions. + \item \textbf{Alignment}: LLMs are next-token prediction models, which means they are not aligned with the user's preferences by default. -\item \textbf{Cost Optimization}: The computational and financial costs of operating LLM-based systems can quickly become prohibitive without careful management, and optimization. + \item \textbf{Vendor Lock-in}: Cloud-based LLM providers can create significant dependencies and lock-in through their proprietary APIs and infrastructure, making it difficult to switch providers or self-host solutions. + + \item \textbf{Cost Optimization}: The computational and financial costs of operating LLM-based systems can quickly become prohibitive without careful management, and optimization. \end{enumerate} +We conclude with a discussion on the future of LLMs and the challenges that will arise as we move forward. + \section{A Practical Approach} This book takes a hands-on approach to these challenges, with a focus on accessibility and reproducibility. All examples and code are: \begin{itemize} -\item Fully reproducible and documented, allowing readers to replicate results exactly -\item Designed to run on consumer-grade hardware without requiring expensive resources -\item Available as open source Python notebooks that can be modified and extended -\item Structured to minimize computational costs while maintaining effectiveness + \item Fully reproducible and documented, allowing readers to replicate results exactly + \item Designed to run on consumer-grade hardware without requiring expensive resources + \item Available as open source Python notebooks that can be modified and extended + \item Structured to minimize computational costs while maintaining effectiveness \end{itemize} \section{An Open Source Approach} @@ -47,10 +48,10 @@ \section{An Open Source Approach} Throughout this book, we'll leverage open source tools and frameworks to address common LLM challenges. In that way, we are prioritizing: \begin{itemize} -\item \textbf{Transparency}: Open source solutions provide visibility into how challenges are being addressed, allowing for better understanding and customization of solutions. -\item \textbf{Flexibility}: Open source tools can be modified and adapted to specific use cases, unlike black-box commercial solutions. -\item \textbf{Cost-Effectiveness}: Most of open source tools we will cover are freely available, fostering accessibility and reducing costs. -\item \textbf{Vendor Independence}: Open source solutions reduce dependency on specific providers, offering more freedom in architectural decisions. + \item \textbf{Transparency}: Open source solutions provide visibility into how challenges are being addressed, allowing for better understanding and customization of solutions. + \item \textbf{Flexibility}: Open source tools can be modified and adapted to specific use cases, unlike black-box commercial solutions. + \item \textbf{Cost-Effectiveness}: Most of open source tools we will cover are freely available, fostering accessibility and reducing costs. + \item \textbf{Vendor Independence}: Open source solutions reduce dependency on specific providers, offering more freedom in architectural decisions. \end{itemize} \section{Open Source Book} @@ -58,10 +59,10 @@ \section{Open Source Book} In keeping with these open source principles, this book itself is open source and available on GitHub. It's designed to be a living document that evolves with the changing landscape of LLM technology and implementation practices. Readers are encouraged to: \begin{itemize} -\item Report issues or suggest improvements through GitHub Issues -\item Contribute new examples or solutions via Pull Requests -\item Share their own experiences and solutions with the community -\item Propose new chapters or sections that address emerging challenges + \item Report issues or suggest improvements through GitHub Issues + \item Contribute new examples or solutions via Pull Requests + \item Share their own experiences and solutions with the community + \item Propose new chapters or sections that address emerging challenges \end{itemize} The repository can be found at \url{https://github.com/souzatharsis/tamingllms}. Whether you've found a typo, have a better solution to share, or want to contribute, your contributions are welcome. Please feel free to open an issue in the book repository. @@ -73,55 +74,55 @@ \section{A Note on Perspective} The current discourse around LLMs tends toward extremes - either uncritical enthusiasm or wholesale dismissal. This book takes a different approach: \begin{itemize} -\item \textbf{Practical Implementation Focus}: Rather than theoretical capabilities, we examine practical challenges and their solutions. -\item \textbf{Code-First Learning}: Every concept is illustrated with executable Python examples, enabling immediate practical application. -\item \textbf{Critical Analysis}: We provide a balanced examination of both capabilities and limitations, helping readers make informed decisions about LLM integration. + \item \textbf{Practical Implementation Focus}: Rather than theoretical capabilities, we examine practical challenges and their solutions. + \item \textbf{Code-First Learning}: Every concept is illustrated with executable Python examples, enabling immediate practical application. + \item \textbf{Critical Analysis}: We provide a balanced examination of both capabilities and limitations, helping readers make informed decisions about LLM integration. \end{itemize} \section{Who This Book Is For} -This book is designed for: +This book is designed for: \begin{itemize} -\item Software/AI Engineers building LLM-powered applications -\item Technical Product Managers leading GenAI initiatives -\item Technical Leaders making architectural decisions -\item Open Source advocates and/or developers building LLM Applications -\item Anyone seeking to understand the practical challenges of working with LLMs + \item Software/AI Engineers building LLM-powered applications + \item Technical Product Managers leading GenAI initiatives + \item Technical Leaders making architectural decisions + \item Open Source advocates and/or developers building LLM Applications + \item Anyone seeking to understand the practical challenges of working with LLMs \end{itemize} Typical job roles: \begin{itemize} -\item Software/AI Engineers building AI-powered platforms -\item Backend Developers integrating LLMs into existing systems -\item ML Engineers transitioning to LLM implementation -\item Technical Leads making architectural decisions -\item Product Managers overseeing GenAI initiatives + \item Software/AI Engineers building AI-powered platforms + \item Backend Developers integrating LLMs into existing systems + \item ML Engineers transitioning to LLM implementation + \item Technical Leads making architectural decisions + \item Product Managers overseeing GenAI initiatives \end{itemize} Reader motivation: \begin{itemize} -\item Need to build reliable, production-ready LLM applications -\item Desire to understand and overcome common LLM implementation challenges -\item Requirement to optimize costs and performance -\item Need to ensure safety and reliability in LLM-powered systems + \item Need to build reliable, production-ready LLM applications + \item Desire to understand and overcome common LLM implementation challenges + \item Requirement to optimize costs and performance + \item Need to ensure safety and reliability in LLM-powered systems \end{itemize} -The goal is to help readers understand and address these challenges early, before they become costly problems too late in the software development lifecycle. +The goal is to help readers understand and address these challenges early, before they become costly problems too late in the software development lifecycle. \section{Outcomes} After reading this book, the reader will understand critical LLM limitations and their implications and have practical experience on recommended open source tools and frameworks to help navigate common LLM pitfalls. The reader will be able to: \begin{itemize} -\item Implement effective strategies for managing LLMs limitations -\item Build reliable LLM-powered applications -\item Create robust testing frameworks for LLM-based systems -\item Deploy proper LLM safeguards -\item Make realistic effort estimations for LLM-based projects -\item Understand the hidden complexities that impact development timelines + \item Implement effective strategies for managing LLMs limitations + \item Build reliable LLM-powered applications + \item Create robust testing frameworks for LLM-based systems + \item Deploy proper LLM safeguards + \item Make realistic effort estimations for LLM-based projects + \item Understand the hidden complexities that impact development timelines \end{itemize} \section{Prerequisites} @@ -129,10 +130,10 @@ \section{Prerequisites} To make the most of this book, you should have: \begin{itemize} -\item Basic Python programming experience -\item Basic knowledge of LLMs and their capabilities -\item Access to and basic knowledge of LLM APIs (Mistral, OpenAI, Anthropic, or similar) -\item A desire to build reliable LLM-based applications + \item Basic Python programming experience + \item Basic knowledge of LLMs and their capabilities + \item Access to and basic knowledge of LLM APIs (Mistral, OpenAI, Anthropic, or similar) + \item A desire to build reliable LLM-based applications \end{itemize} \section{Setting Up Your Environment} @@ -141,28 +142,29 @@ \section{Setting Up Your Environment} \subsection{Code Repository} Clone the book's companion repository: -\begin{verbatim} +\begin{minted}{bash} git clone https://github.com/souzatharsis/tamingllms.git cd tamingllms/notebooks -\end{verbatim} +\end{minted} \subsection{Python Environment Setup} -\begin{verbatim} +\begin{minted}{bash} # Create and activate a virtual environment python -m venv taming-llms-env source taming-llms-env/bin/activate # On Windows, use: taming-llms-env\Scripts\activate -\end{verbatim} +\end{minted} + We will try and make each Chapter as self-contained as possible, including all necessary installs as we go through the examples. Feel free to use your preferred package manager to install the dependencies (e.g. \texttt{pip}). We used \texttt{poetry} to manage dependencies and virtual environments. \subsection{API Keys Configuration} \begin{enumerate} -\item Create a \texttt{.env} file in the root directory of the project. -\item Add your API keys and other sensitive information to the \texttt{.env} file. For example: + \item Create a \texttt{.env} file in the root directory of the project. + \item Add your API keys and other sensitive information to the \texttt{.env} file. For example: -\begin{verbatim} -OPENAI_API_KEY=your_openai_api_key_here -\end{verbatim} + \begin{minted}{bash} + OPENAI_API_KEY=your_openai_api_key_here + \end{minted} \end{enumerate} \begin{note} @@ -171,15 +173,15 @@ \subsection{API Keys Configuration} \subsection{Troubleshooting Common Issues} \begin{itemize} -\item If you encounter API rate limits, consider using smaller examples or implementing retry logic -\item For package conflicts, try creating a fresh virtual environment or use a package manager like \texttt{poetry} -\item Check the book's repository issues page for known problems and solutions + \item If you encounter API rate limits, consider using smaller examples or implementing retry logic + \item For package conflicts, try creating a fresh virtual environment or use a package manager like \texttt{poetry} + \item Check the book's repository issues page for known problems and solutions \end{itemize} Now that your environment is set up, let's begin our exploration of LLM challenges. \section{About the Author} -Tharsis Souza (Ph.D. Computer Science, UCL University of London) is a computer scientist and product leader specializing in AI-based products. He is a Lecturer at Columbia University's Master of Science program in Applied Analytics, (\textit{incoming}) Head of Product, Equities at Citadel, and former Senior VP at Two Sigma Investments. He mentors under-represented students \& working professionals to help create a more diverse global AI1 ecosystem. +Tharsis Souza (Ph.D. Computer Science, UCL University of London) is a computer scientist and product leader specializing in AI-based products. He is a Lecturer at Columbia University's Master of Science program in Applied Analytics, (\textit{incoming}) Head of Product, Equities at Citadel, and former Senior VP at Two Sigma Investments. He mentors under-represented students \& working professionals to help create a more diverse global AI ecosystem. With over 15 years of experience delivering technology products across startups and Fortune 500 companies, he is also an author of numerous scholarly publications and a frequent speaker at academic and business conferences. Grounded on academic background and drawing from practical experience building and scaling up products powered by language models at early-stage startups, major institutions as well as contributing to open source projects, he brings a unique perspective on bridging the gap between LLMs promised potential and their practical implementation challenges to enable the next generation of AI-powered products. diff --git a/tamingllms/markdown/evals.md b/tamingllms/markdown/evals.md index 9f732af..fac4a64 100644 --- a/tamingllms/markdown/evals.md +++ b/tamingllms/markdown/evals.md @@ -1,35 +1,42 @@ -# Challenges of Evaluating LLM-based Applications +(evals)= +# The Evals Gap ```{epigraph} -Evals are surprisingly often all you need. +It doesn't matter how beautiful your theory is,
    +it doesn't matter how smart you are.
    +If it doesn't agree with experiment, it's wrong. --- Greg Brockman, OpenAI's President +-- Richard Feynman ``` ```{contents} ``` -## Non-Deterministic Machines -One of the most fundamental challenges when building products with Large Language Models (LLMs) is their non-deterministic nature. Unlike traditional software systems where the same input reliably produces the same output, LLMs can generate different responses each time they're queried - even with identical prompts and input data. This characteristic is both a strength and a significant engineering challenge. +## Introduction -When you ask ChatGPT or any other LLM the same question multiple times, you'll likely get different responses. This isn't a bug - it's a fundamental feature of how these models work. The "temperature" parameter, which controls the randomness of outputs, allows models to be creative and generate diverse responses. However, this same feature makes it incredibly difficult to build reliable, testable systems. +The advent of LLMs marks a pivotal shift in the landscape of software development, testing and verification. Unlike traditional software systems, where deterministic outputs are the norm, LLMs introduce a realm of non-deterministic and generative behaviors that challenge conventional software engineering paradigms. This shift is not merely a technical evolution but a fundamental transformation in how we conceive, build, and assess software products. + +For those entrenched in traditional methodologies, the transition to LLM-driven systems may seem daunting. However, ignoring this change is not an option. The reliance on outdated testing frameworks that fail to account for the probabilistic nature of LLMs will inevitably lead to significant setbacks. + +To overcome these challenges, it is imperative to embrace the complexities of LLMs with a proactive mindset. This involves developing robust evaluation frameworks up-front that incorporate the generative nature of LLM-based software development while fostering a culture of continuous change, learning and adaptation. + + +## Non-Deterministic Generative Machines + +One of the most fundamental challenges when building products with LLMs is their generative and non-deterministic nature. Unlike traditional software systems where the same input reliably produces the same output, LLMs can generate novel text that may not exist in their training data, and produce different responses each time they're queried - even with identical prompts and input data. This behavior is both a strength and a significant engineering and product challenge. + +When you ask an LLM the same question multiple times, you'll likely get different responses. This isn't a bug - it's a fundamental feature of how these models work. The "temperature" parameter, which controls the randomness of outputs, allows models to be creative and generate diverse responses. However, this same feature makes it difficult to build reliable, testable systems. Consider a financial services company using LLMs to generate investment advice. The non-deterministic nature of these models means that: -- The same market data could yield different analysis conclusions -- Testing becomes exceedingly more complex compared to traditional software +- The same input data could yield different analysis conclusions - Regulatory compliance becomes challenging to guarantee - User trust may be affected by inconsistent responses - -### Temperature and Sampling +- Testing becomes exceedingly more complex compared to traditional software The primary source of non-determinism in LLMs comes from their sampling strategies. During text generation, the model: 1. Calculates probability distributions for each next token 2. Samples from these distributions based on temperature settings -3. Uses techniques like nucleus sampling to balance creativity and coherence - -### The Temperature Spectrum +3. Uses techniques like nucleus sampling {cite}`holtzman2020curiouscaseneuraltext` or top-k sampling to balance creativity and coherence -- Temperature = 0: Most deterministic, but potentially repetitive -- Temperature = 1: Balanced creativity and coherence -- Temperature > 1: Increased randomness, potentially incoherent +In this simple experiment, we use an LLM to write a single-statement executive summary from an input financial filing. We observe that even a simple parameter like temperature can dramatically alter model behavior in ways that are difficult to systematically assess. At temperature 0.0, responses are consistent but potentially too rigid. At 1.0, outputs become more varied but less predictable. At 2.0, responses can be wildly different and often incoherent. This non-deterministic behavior makes traditional software testing approaches inadequate. ```python @@ -84,6 +91,25 @@ def generate_responses( ``` +```python +MAX_LENGTH = 10000 # We limit the input length to avoid token issues +with open('../data/apple.txt', 'r') as file: + sec_filing = file.read() +``` + + +```python +sec_filing +``` + + + + + 'UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\n \nFORM 10-K\n \n(Mark One)\n☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the fiscal year ended September 28, 2024\nor\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the transition period from to .\nCommission File Number: 001-36743\n \ng66145g66i43.jpg\nApple Inc.\n(Exact name of Registrant as specified in its charter)\n \nCalifornia\t\t94-2404110\n(State or other jurisdiction\nof incorporation or organization)\n(I.R.S. Employer Identification No.)\nOne Apple Park Way\t\t\nCupertino, California\n95014\n(Address of principal executive offices)\t\t(Zip Code)\n \n(408) 996-1010\n(Registrant’s telephone number, including area code)\n \nSecurities registered pursuant to Section 12(b) of the Act:\nTitle of each class\tTrading symbol(s)\tName of each exchange on which registered\nCommon Stock, $0.00001 par value per share\nAAPL\tThe Nasdaq Stock Market LLC\n0.000% Notes due 2025\t—\tThe Nasdaq Stock Market LLC\n0.875% Notes due 2025\t—\tThe Nasdaq Stock Market LLC\n1.625% Notes due 2026\t—\tThe Nasdaq Stock Market LLC\n2.000% Notes due 2027\t—\tThe Nasdaq Stock Market LLC\n1.375% Notes due 2029\t—\tThe Nasdaq Stock Market LLC\n3.050% Notes due 2029\t—\tThe Nasdaq Stock Market LLC\n0.500% Notes due 2031\t—\tThe Nasdaq Stock Market LLC\n3.600% Notes due 2042\t—\tThe Nasdaq Stock Market LLC\n \nSecurities registered pursuant to Section 12(g) of the Act: None\n \nIndicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act.\nYes ☒ No ☐\nIndicate by check mark if the Registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act.\nYes ☐ No ☒\n\nIndicate by check mark whether the Registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding 12 months (or for such shorter period that the Registrant was required to file such reports), and (2) has been subject to such filing requirements for the past 90 days.\nYes ☒ No ☐\nIndicate by check mark whether the Registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T (§232.405 of this chapter) during the preceding 12 months (or for such shorter period that the Registrant was required to submit such files).\nYes ☒ No ☐\nIndicate by check mark whether the Registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, a smaller reporting company, or an emerging growth company. See the definitions of “large accelerated filer,” “accelerated filer,” “smaller reporting company,” and “emerging growth company” in Rule 12b-2 of the Exchange Act.\nLarge accelerated filer\t\t☒\t\tAccelerated filer\t\t☐\nNon-accelerated filer\t\t☐\t\tSmaller reporting company\t\t☐\nEmerging growth company\t\t☐\n \nIf an emerging growth company, indicate by check mark if the Registrant has elected not to use the extended transition period for complying with any new or revised financial accounting standards provided pursuant to Section 13(a) of the Exchange Act. ☐\nIndicate by check mark whether the Registrant has filed a report on and attestation to its management’s assessment of the effectiveness of its internal control over financial reporting under Section 404(b) of the Sarbanes-Oxley Act (15 U.S.C. 7262(b)) by the registered public accounting firm that prepared or issued its audit report. ☒\nIf securities are registered pursuant to Section 12(b) of the Act, indicate by check mark whether the financial statements of the registrant included in the filing reflect the correction of an error to previously issued financial statements. ☐\nIndicate by check mark whether any of those error corrections are restatements that required a recovery analysis of incentive-based compensation received by any of the registrant’s executive officers during the relevant recovery period pursuant to §240.10D-1(b). ☐\nIndicate by check mark whether the Registrant is a shell company (as defined in Rule 12b-2 of the Act).\nYes ☐ No ☒\nThe aggregate market value of the voting and non-voting stock held by non-affiliates of the Registrant, as of March 29, 2024, the last business day of the Registrant’s most recently completed second fiscal quarter, was approximately $2,628,553,000,000. Solely for purposes of this disclosure, shares of common stock held by executive officers and directors of the Registrant as of such date have been excluded because such persons may be deemed to be affiliates. This determination of executive officers and directors as affiliates is not necessarily a conclusive determination for any other purposes.\n15,115,823,000 shares of common stock were issued and outstanding as of October 18, 2024.\nDOCUMENTS INCORPORATED BY REFERENCE\nPortions of the Registrant’s definitive proxy statement relating to its 2025 annual meeting of shareholders are incorporated by reference into Part III of this Annual Report on Form 10-K where indicated. The Registrant’s definitive proxy statement will be filed with the U.S. Securities and Exchange Commission within 120 days after the end of the fiscal year to which this report relates.\n \n\n\nApple Inc.\n\nForm 10-K\nFor the Fiscal Year Ended September 28, 2024\nTABLE OF CONTENTS\n\nPage\nPart I\nItem 1.\nBusiness\n1\nItem 1A.\nRisk Factors\n5\nItem 1B.\nUnresolved Staff Comments\n17\nItem 1C.\nCybersecurity\n17\nItem 2.\nProperties\n18\nItem 3.\nLegal Proceedings\n18\nItem 4.\nMine Safety Disclosures\n18\nPart II\nItem 5.\nMarket for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities\n19\nItem 6.\n[Reserved]\n20\nItem 7.\nManagement’s Discussion and Analysis of Financial Condition and Results of Operations\n21\nItem 7A.\nQuantitative and Qualitative Disclosures About Market Risk\n27\nItem 8.\nFinancial Statements and Supplementary Data\n28\nItem 9.\nChanges in and Disagreements with Accountants on Accounting and Financial Disclosure\n51\nItem 9A.\nControls and Procedures\n51\nItem 9B.\nOther Information\n52\nItem 9C.\nDisclosure Regarding Foreign Jurisdictions that Prevent Inspections\n52\nPart III\nItem 10.\nDirectors, Executive Officers and Corporate Governance\n52\nItem 11.\nExecutive Compensation\n52\nItem 12.\nSecurity Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters\n52\nItem 13.\nCertain Relationships and Related Transactions, and Director Independence\n52\nItem 14.\nPrincipal Accountant Fees and Services\n52\nPart IV\nItem 15.\nExhibit and Financial Statement Schedules\n53\nItem 16.\nForm 10-K Summary\n56\n \n\n\nThis Annual Report on Form 10-K (“Form 10-K”) contains forward-looking statements, within the meaning of the Private Securities Litigation Reform Act of 1995, that involve risks and uncertainties. Many of the forward-looking statements are located in Part I, Item 1 of this Form 10-K under the heading “Business” and Part II, Item 7 of this Form 10-K under the heading “Management’s Discussion and Analysis of Financial Condition and Results of Operations.” Forward-looking statements provide current expectations of future events based on certain assumptions and include any statement that does not directly relate to any historical or current fact. For example, statements in this Form 10-K regarding the potential future impact of macroeconomic conditions on the Company’s business and results of operations are forward-looking statements. Forward-looking statements can also be identified by words such as “future,” “anticipates,” “believes,” “estimates,” “expects,” “intends,” “plans,” “predicts,” “will,” “would,” “could,” “can,” “may,” and similar terms. Forward-looking statements are not guarantees of future performance and the Company’s actual results may differ significantly from the results discussed in the forward-looking statements. Factors that might cause such differences include, but are not limited to, those discussed in Part I, Item 1A of this Form 10-K under the heading “Risk Factors.” The Company assumes no obligation to revise or update any forward-looking statements for any reason, except as required by law.\nUnless otherwise stated, all information presented herein is based on the Company’s fiscal calendar, and references to particular years, quarters, months or periods refer to the Company’s fiscal years ended in September and the associated quarters, months and periods of those fiscal years. Each of the terms the “Company” and “Apple” as used herein refers collectively to Apple Inc. and its wholly owned subsidiaries, unless otherwise stated.\nPART I\nItem 1. Business\nCompany Background\nThe Company designs, manufactures and markets smartphones, personal computers, tablets, wearables and accessories, and sells a variety of related services. The Company’s fiscal year is the 52- or 53-week period that ends on the last Saturday of September.\nProducts\niPhone\niPhone® is the Company’s line of smartphones based on its iOS operating system. The iPhone line includes iPhone 16 Pro, iPhone 16, iPhone 15, iPhone 14 and iPhone SE®.\nMac\nMac® is the Company’s line of personal computers based on its macOS® operating system. The Mac line includes laptops MacBook Air® and MacBook Pro®, as well as desktops iMac®, Mac mini®, Mac Studio® and Mac Pro®.\niPad\niPad® is the Company’s line of multipurpose tablets based on its iPadOS® operating system. The iPad line includes iPad Pro®, iPad Air®, iPad and iPad mini®.\nWearables, Home and Accessories\nWearables includes smartwatches, wireless headphones and spatial computers. The Company’s line of smartwatches, based on its watchOS® operating system, includes Apple Watch Ultra® 2, Apple Watch® Series 10 and Apple Watch SE®. The Company’s line of wireless headphones includes AirPods®, AirPods Pro®, AirPods Max® and Beats® products. Apple Vision Pro™ is the Company’s first spatial computer based on its visionOS™ operating system.\nHome includes Apple TV®, the Company’s media streaming and gaming device based on its tvOS® operating system, and HomePod® and HomePod mini®, high-fidelity wireless smart speakers.\nAccessories includes Apple-branded and third-party accessories.\nApple Inc. | 2024 Form 10-K | 1\n\nServices\nAdvertising\nThe Company’s advertising services include third-party licensing arrangements and the Company’s own advertising platforms.\nAppleCare\nThe Company offers a portfolio of fee-based service and support products under the AppleCare® brand. The offerings provide priority access to Apple technical support, access to the global Apple authorized service network for repair and replacement services, and in many cases additional coverage for instances of accidental damage or theft and loss, depending on the country and type of product.\nCloud Services\nThe Company’s cloud services store and keep customers’ content up-to-date and available across multiple Apple devices and Windows personal computers.\nDigital Content\nThe Company operates various platforms, including the App Store®, that allow customers to discover and download applications and digital content, such as books, music, video, games and podcasts.\nThe Company also offers digital content through subscription-based services, including Apple Arcade®, a game subscription service; Apple Fitness+SM, a personalized fitness service; Apple Music®, which offers users a curated listening experience with on-demand radio stations; Apple News+®, a subscription news and magazine service; and Apple TV+®, which offers exclusive original content and live sports.\nPayment Services\nThe Company offers payment services, including Apple Card®, a co-branded credit card, and Apple Pay®, a cashless payment service.\nSegments\nThe Company manages its business primarily on a geographic basis. The Company’s reportable segments consist of the Americas, Europe, Greater China, Japan and Rest of Asia Pacific. Americas includes both North and South America. Europe includes European countries, as well as India, the Middle East and Africa. Greater China includes China mainland, Hong Kong and Taiwan. Rest of Asia Pacific includes Australia and those Asian countries not included in the Company’s other reportable segments. Although the reportable segments provide similar hardware and software products and similar services, each one is managed separately to better align with the location of the Company’s customers and distribution partners and the unique market dynamics of each geographic region.\nMarkets and Distribution\nThe Company’s customers are primarily in the consumer, small and mid-sized business, education, enterprise and government markets. The Company sells its products and resells third-party products in most of its major markets directly to customers through its retail and online stores and its direct sales force. The Company also employs a variety of indirect distribution channels, such as third-party cellular network carriers, wholesalers, retailers and resellers. During 2024, the Company’s net sales through its direct and indirect distribution channels accounted for 38% and 62%, respectively, of total net sales.\nCompetition\nThe markets for the Company’s products and services are highly competitive, and are characterized by aggressive price competition and resulting downward pressure on gross margins, frequent introduction of new products and services, short product life cycles, evolving industry standards, continual improvement in product price and performance characteristics, rapid adoption of technological advancements by competitors, and price sensitivity on the part of consumers and businesses. Many of the Company’s competitors seek to compete primarily through aggressive pricing and very low cost structures, and by imitating the Company’s products and infringing on its intellectual property.\nApple Inc. | 2024 Form 10-K | 2\n\nThe Company’s ability to compete successfully depends heavily on ensuring the continuing and timely introduction of innovative new products, services and technologies to the marketplace. The Company designs and develops nearly the entire solution for its products, including the hardware, operating system, numerous software applications and related services. Principal competitive factors important to the Company include price, product and service features (including security features), relative price and performance, product and service quality and reliability, design innovation, a strong third-party software and accessories ecosystem, marketing and distribution capability, service and support, and corporate reputation.\nThe Company is focused on expanding its market opportunities related to smartphones, personal computers, tablets, wearables and accessories, and services. The Company faces substantial competition in these markets from companies that have significant technical, marketing, distribution and other resources, as well as established hardware, software, and service offerings with large customer bases. In addition, some of the Company’s competitors have broader product lines, lower-priced products and a larger installed base of active devices. Competition has been particularly intense as competitors have aggressively cut prices and lowered product margins. Certain competitors have the resources, experience or cost structures to provide products at little or no profit or even at a loss. The Company’s services compete with business models that provide content to users for free and use illegitimate means to obtain third-party digital content and applications. The Company faces significant competition as competitors imitate the Company’s product features and applications within their products, or collaborate to offer integrated solutions that are more competitive than those they currently offer.\nSupply of Components\nAlthough most components essential to the Company’s business are generally available from multiple sources, certain components are currently obtained from single or limited sources. The Company also competes for various components with other participants in the markets for smartphones, personal computers, tablets, wearables and accessories. Therefore, many components used by the Company, including those that are available from multiple sources, are at times subject to industry-wide shortage and significant commodity pricing fluctuations.\nThe Company uses some custom components that are not commonly used by its competitors, and new products introduced by the Company often utilize custom components available from only one source. When a component or product uses new technologies, initial capacity constraints may exist until the suppliers’ yields have matured or their manufacturing capacities have increased. The continued availability of these components at acceptable prices, or at all, may be affected if suppliers decide to concentrate on the production of common components instead of components customized to meet the Company’s requirements.\nThe Company has entered into agreements for the supply of many components; however, there can be no guarantee that the Company will be able to extend or renew these agreements on similar terms, or at all.\nResearch and Development\nBecause the industries in which the Company competes are characterized by rapid technological advances, the Company’s ability to compete successfully depends heavily upon its ability to ensure a continual and timely flow of competitive products, services and technologies to the marketplace. The Company continues to develop new technologies to enhance existing products and services, and to expand the range of its offerings through research and development (“R&D”), licensing of intellectual property and acquisition of third-party businesses and technology.\nIntellectual Property\nThe Company currently holds a broad collection of intellectual property rights relating to certain aspects of its hardware, accessories, software and services. This includes patents, designs, copyrights, trademarks, trade secrets and other forms of intellectual property rights in the U.S. and various foreign countries. Although the Company believes the ownership of such intellectual property rights is an important factor in differentiating its business and that its success does depend in part on such ownership, the Company relies primarily on the innovative skills, technical competence and marketing abilities of its personnel.\nThe Company regularly files patent, design, copyright and trademark applications to protect innovations arising from its research, development, design and marketing, and is currently pursuing thousands of applications around the world. Over time, the Company has accumulated a large portfolio of issued and registered intellectual property rights around the world. No single intellectual property right is solely responsible for protecting the Company’s products and services. The Company believes the duration of its intellectual property rights is adequate relative to the expected lives of its products and services.\nIn addition to Company-owned intellectual property, many of the Company’s products and services are designed to include intellectual property owned by third parties. It may be necessary in the future to seek or renew licenses relating to various aspects of the Company’s products, processes and services. While the Company has generally been able to obtain such licenses on commercially reasonable terms in the past, there is no guarantee that such licenses could be obtained in the future on reasonable terms or at all.\nApple Inc. | 2024 Form 10-K | 3\n\nBusiness Seasonality and Product Introductions\nThe Company has historically experienced higher net sales in its first quarter compared to other quarters in its fiscal year due in part to seasonal holiday demand. Additionally, new product and service introductions can significantly impact net sales, cost of sales and operating expenses. The timing of product introductions can also impact the Company’s net sales to its indirect distribution channels as these channels are filled with new inventory following a product launch, and channel inventory of an older product often declines as the launch of a newer product approaches. Net sales can also be affected when consumers and distributors anticipate a product introduction.\nHuman Capital\nThe Company believes that its people play an important role in its success, and strives to attract, develop and retain the best talent. The Company works to create an inclusive, safe and supportive environment for all of its team members, so that its people can do the best work of their lives. As of September 28, 2024, the Company had approximately 164,000 full-time equivalent employees.\nCompensation and Benefits\nThe Company believes that compensation should be competitive and equitable, and should enable employees to share in the Company’s success. The Company recognizes its people are most likely to thrive when they have the resources to meet their needs and the time and support to succeed in their professional and personal lives. In support of this, the Company offers a wide variety of benefits for employees around the world, including health, wellness and time away.\nGrowth and Development\nThe Company invests in resources to help its people develop and achieve their career goals. The Company offers programs through Apple University on leadership, management and influence, as well as Apple culture and values. Team members can also take advantage of online classes for business, technical and personal development, as well as learning opportunities to support their well-being.\nWorkplace Practices and Policies\nThe Company is an equal opportunity employer committed to inclusion and diversity and to providing a workplace free of harassment or discrimination.\nInclusion and Diversity\nThe Company is committed to its vision to build and sustain a more inclusive workforce that is representative of the communities it serves. The Company continues to work to increase diverse representation at every level, foster an inclusive culture, and support equitable pay and access to opportunity for all employees.\nEngagement\nThe Company believes that open and honest communication among team members, managers and leaders helps create an open, collaborative work environment where everyone can contribute, grow and succeed. Team members are encouraged to come to their managers with questions, feedback or concerns, and the Company conducts surveys that gauge employee sentiment in areas like career development, manager performance and inclusivity.\nHealth and Safety\nThe Company is committed to protecting its team members everywhere it operates. The Company identifies potential workplace risks in order to develop measures to mitigate possible hazards. The Company supports employees with general safety, security and crisis management training, and by putting specific programs in place for those working in potentially high-hazard environments. Additionally, the Company works to protect the safety and security of its team members, visitors and customers through its global security team.\nApple Inc. | 2024 Form 10-K | 4\n\nAvailable Information\nThe Company’s Annual Reports on Form 10-K, Quarterly Reports on Form 10-Q, Current Reports on Form 8-K, and amendments to reports filed pursuant to Sections 13(a) and 15(d) of the Securities Exchange Act of 1934, as amended (the “Exchange Act”), are filed with the U.S. Securities and Exchange Commission (the “SEC”). Such reports and other information filed by the Company with the SEC are available free of charge at investor.apple.com/investor-relations/sec-filings/default.aspx when such reports are available on the SEC’s website. The Company periodically provides certain information for investors on its corporate website, www.apple.com, and its investor relations website, investor.apple.com. This includes press releases and other information about financial performance, information on environmental, social and governance matters, and details related to the Company’s annual meeting of shareholders. The information contained on the websites referenced in this Form 10-K is not incorporated by reference into this filing. Further, the Company’s references to website URLs are intended to be inactive textual references only.\nItem 1A. Risk Factors\nThe Company’s business, reputation, results of operations, financial condition and stock price can be affected by a number of factors, whether currently known or unknown, including those described below. When any one or more of these risks materialize from time to time, the Company’s business, reputation, results of operations, financial condition and stock price can be materially and adversely affected.\nBecause of the following factors, as well as other factors affecting the Company’s results of operations and financial condition, past financial performance should not be considered to be a reliable indicator of future performance, and investors should not use historical trends to anticipate results or trends in future periods. This discussion of risk factors contains forward-looking statements.\nThis section should be read in conjunction with Part II, Item 7, “Management’s Discussion and Analysis of Financial Condition and Results of Operations” and the consolidated financial statements and accompanying notes in Part II, Item 8, “Financial Statements and Supplementary Data” of this Form 10-K.\nMacroeconomic and Industry Risks\nThe Company’s operations and performance depend significantly on global and regional economic conditions and adverse economic conditions can materially adversely affect the Company’s business, results of operations and financial condition.\nThe Company has international operations with sales outside the U.S. representing a majority of the Company’s total net sales. In addition, the Company’s global supply chain is large and complex and a majority of the Company’s supplier facilities, including manufacturing and assembly sites, are located outside the U.S. As a result, the Company’s operations and performance depend significantly on global and regional economic conditions.\nAdverse macroeconomic conditions, including slow growth or recession, high unemployment, inflation, tighter credit, higher interest rates, and currency fluctuations, can adversely impact consumer confidence and spending and materially adversely affect demand for the Company’s products and services. In addition, consumer confidence and spending can be materially adversely affected in response to changes in fiscal and monetary policy, financial market volatility, declines in income or asset values, and other economic factors.\nIn addition to an adverse impact on demand for the Company’s products and services, uncertainty about, or a decline in, global or regional economic conditions can have a significant impact on the Company’s suppliers, contract manufacturers, logistics providers, distributors, cellular network carriers and other channel partners, and developers. Potential outcomes include financial instability; inability to obtain credit to finance business operations; and insolvency.\nAdverse economic conditions can also lead to increased credit and collectibility risk on the Company’s trade receivables; the failure of derivative counterparties and other financial institutions; limitations on the Company’s ability to issue new debt; reduced liquidity; and declines in the fair values of the Company’s financial instruments. These and other impacts can materially adversely affect the Company’s business, results of operations, financial condition and stock price.\nApple Inc. | 2024 Form 10-K | 5\n\nThe Company’s business can be impacted by political events, trade and other international disputes, geopolitical tensions, conflict, terrorism, natural disasters, public health issues, industrial accidents and other business interruptions.\nPolitical events, trade and other international disputes, geopolitical tensions, conflict, terrorism, natural disasters, public health issues, industrial accidents and other business interruptions can have a material adverse effect on the Company and its customers, employees, suppliers, contract manufacturers, logistics providers, distributors, cellular network carriers and other channel partners.\nThe Company has a large, global business with sales outside the U.S. representing a majority of the Company’s total net sales, and the Company believes that it generally benefits from growth in international trade. Substantially all of the Company’s manufacturing is performed in whole or in part by outsourcing partners located primarily in China mainland, India, Japan, South Korea, Taiwan and Vietnam. Restrictions on international trade, such as tariffs and other controls on imports or exports of goods, technology or data, can materially adversely affect the Company’s business and supply chain. The impact can be particularly significant if these restrictive measures apply to countries and regions where the Company derives a significant portion of its revenues and/or has significant supply chain operations. Restrictive measures can increase the cost of the Company’s products and the components and raw materials that go into them, and can require the Company to take various actions, including changing suppliers, restructuring business relationships and operations, and ceasing to offer and distribute affected products, services and third-party applications to its customers. Changing the Company’s business and supply chain in accordance with new or changed restrictions on international trade can be expensive, time-consuming and disruptive to the Company’s operations. Such restrictions can be announced with little or no advance notice, which can create uncertainty, and the Company may not be able to effectively mitigate all adverse impacts from such measures. For example, tensions between governments, including the U.S. and China, have in the past led to tariffs and other restrictions affecting the Company’s business. If disputes and conflicts further escalate in the future, actions by governments in response could be significantly more severe and restrictive and could materially adversely affect the Company’s business.\nMany of the Company’s operations and facilities, as well as critical business operations of the Company’s suppliers and contract manufacturers, are in locations that are prone to earthquakes and other natural disasters. Global climate change is resulting in certain types of natural disasters and extreme weather occurring more frequently or with more intense effects. In addition, the Company’s and its suppliers’ operations and facilities are subject to the risk of interruption by fire, power shortages, nuclear power plant accidents and other industrial accidents, terrorist attacks and other hostile acts, ransomware and other cybersecurity attacks, labor disputes, public health issues and other events beyond the Company’s control. For example, global supply chains can be highly concentrated and geopolitical tensions or conflict could result in significant disruptions.\nSuch events can make it difficult or impossible for the Company to manufacture and deliver products to its customers, create delays and inefficiencies in the Company’s supply and manufacturing chain, result in slowdowns and outages to the Company’s service offerings, increase the Company’s costs, and negatively impact consumer spending and demand in affected areas.\nThe Company’s operations are also subject to the risks of industrial accidents at its suppliers and contract manufacturers. While the Company’s suppliers are required to maintain safe working environments and operations, an industrial accident could occur and could result in serious injuries or loss of life, disruption to the Company’s business, and harm to the Company’s reputation. Major public health issues, including pandemics such as the COVID-19 pandemic, have adversely affected, and could in the future materially adversely affect, the Company due to their impact on the global economy and demand for consumer products; the imposition of protective public safety measures, such as stringent employee travel restrictions and limitations on freight services and the movement of products between regions; and disruptions in the Company’s operations, supply chain and sales and distribution channels, resulting in interruptions to the supply of current products and offering of existing services, and delays in production ramps of new products and development of new services.\nFollowing any interruption to its business, the Company can require substantial recovery time, experience significant expenditures to resume operations, and lose significant sales. Because the Company relies on single or limited sources for the supply and manufacture of many critical components, a business interruption affecting such sources would exacerbate any negative consequences to the Company. While the Company maintains insurance coverage for certain types of losses, such insurance coverage may be insufficient to cover all losses that may arise.\nApple Inc. | 2024 Form 10-K | 6\n\nGlobal markets for the Company’s products and services are highly competitive and subject to rapid technological change, and the Company may be unable to compete effectively in these markets.\nThe Company’s products and services are offered in highly competitive global markets characterized by aggressive price competition and resulting downward pressure on gross margins, frequent introduction of new products and services, short product life cycles, evolving industry standards, continual improvement in product price and performance characteristics, rapid adoption of technological advancements by competitors, and price sensitivity on the part of consumers and businesses.\nThe Company’s ability to compete successfully depends heavily on ensuring the continuing and timely introduction of innovative new products, services and technologies to the marketplace. The Company designs and develops nearly the entire solution for its products, including the hardware, operating system, numerous software applications and related services. As a result, the Company must make significant investments in R&D. There can be no assurance these investments will achieve expected returns, and the Company may not be able to develop and market new products and services successfully.\nThe Company currently holds a significant number of patents, trademarks and copyrights and has registered, and applied to register, additional patents, trademarks and copyrights. In contrast, many of the Company’s competitors seek to compete primarily through aggressive pricing and very low cost structures, and by imitating the Company’s products and infringing on its intellectual property. Effective intellectual property protection is not consistently available in every country in which the Company operates. If the Company is unable to continue to develop and sell innovative new products with attractive margins or if competitors infringe on the Company’s intellectual property, the Company’s ability to maintain a competitive advantage could be materially adversely affected.\nThe Company has a minority market share in the global smartphone, personal computer and tablet markets. The Company faces substantial competition in these markets from companies that have significant technical, marketing, distribution and other resources, as well as established hardware, software and digital content supplier relationships. In addition, some of the Company’s competitors have broader product lines, lower-priced products and a larger installed base of active devices. Competition has been particularly intense as competitors have aggressively cut prices and lowered product margins. Certain competitors have the resources, experience or cost structures to provide products at little or no profit or even at a loss. Some of the markets in which the Company competes have from time to time experienced little to no growth or contracted overall.\nAdditionally, the Company faces significant competition as competitors imitate the Company’s product features and applications within their products or collaborate to offer solutions that are more competitive than those they currently offer. The Company also expects competition to intensify as competitors imitate the Company’s approach to providing components seamlessly within their offerings or work collaboratively to offer integrated solutions.\nThe Company’s services also face substantial competition, including from companies that have significant resources and experience and have established service offerings with large customer bases. The Company competes with business models that provide content to users for free. The Company also competes with illegitimate means to obtain third-party digital content and applications.\nThe Company’s business, results of operations and financial condition depend substantially on the Company’s ability to continually improve its products and services to maintain their functional and design advantages. There can be no assurance the Company will be able to continue to provide products and services that compete effectively.\nBusiness Risks\nTo remain competitive and stimulate customer demand, the Company must successfully manage frequent introductions and transitions of products and services.\nDue to the highly volatile and competitive nature of the markets and industries in which the Company competes, the Company must continually introduce new products, services and technologies, enhance existing products and services, effectively stimulate customer demand for new and upgraded products and services, and successfully manage the transition to these new and upgraded products and services. The success of new product and service introductions depends on a number of factors, including timely and successful development, market acceptance, the Company’s ability to manage the risks associated with new technologies and production ramp-up issues, the availability of application software or other third-party support for the Company’s products and services, the effective management of purchase commitments and inventory levels in line with anticipated product demand, the availability of products in appropriate quantities and at expected costs to meet anticipated demand, and the risk that new products and services may have quality or other defects or deficiencies. New products, services and technologies may replace or supersede existing offerings and may produce lower revenues and lower profit margins, which can materially adversely impact the Company’s business, results of operations and financial condition. There can be no assurance the Company will successfully manage future introductions and transitions of products and services.\nApple Inc. | 2024 Form 10-K | 7\n\nThe Company depends on component and product manufacturing and logistical services provided by outsourcing partners, many of which are located outside of the U.S.\nSubstantially all of the Company’s manufacturing is performed in whole or in part by outsourcing partners located primarily in China mainland, India, Japan, South Korea, Taiwan and Vietnam, and a significant concentration of this manufacturing is currently performed by a small number of outsourcing partners, often in single locations. The Company has also outsourced much of its transportation and logistics management. While these arrangements can lower operating costs, they also reduce the Company’s direct control over production and distribution. Such diminished control has from time to time and may in the future have an adverse effect on the quality or quantity of products manufactured or services provided, or adversely affect the Company’s flexibility to respond to changing conditions. Although arrangements with these partners may contain provisions for product defect expense reimbursement, the Company generally remains responsible to the consumer for warranty and out-of-warranty service in the event of product defects and experiences unanticipated product defect liabilities from time to time. While the Company relies on its partners to adhere to its supplier code of conduct, violations of the supplier code of conduct occur from time to time and can materially adversely affect the Company’s business, reputation, results of operations and financial condition.\nThe Company relies on single-source outsourcing partners in the U.S., Asia and Europe to supply and manufacture many components, and on outsourcing partners primarily located in Asia, for final assembly of substantially all of the Company’s hardware products. Any failure of these partners to perform can have a negative impact on the Company’s cost or supply of components or finished goods. In addition, manufacturing or logistics in these locations or transit to final destinations can be disrupted for a variety of reasons, including natural and man-made disasters, information technology system failures, commercial disputes, economic, business, labor, environmental, public health or political issues, trade and other international disputes, geopolitical tensions, or conflict.\nThe Company has invested in manufacturing process equipment, much of which is held at certain of its outsourcing partners, and has made prepayments to certain of its suppliers associated with long-term supply agreements. While these arrangements help ensure the supply of components and finished goods, if these outsourcing partners or suppliers experience severe financial problems or other disruptions in their business, such continued supply can be disrupted or terminated, and the recoverability of manufacturing process equipment or prepayments can be negatively impacted.\nChanges or additions to the Company’s supply chain require considerable time and resources and involve significant risks and uncertainties, including exposure to additional regulatory and operational risks.\nFuture operating results depend upon the Company’s ability to obtain components in sufficient quantities on commercially reasonable terms.\nBecause the Company currently obtains certain components from single or limited sources, the Company is subject to significant supply and pricing risks. Many components, including those that are available from multiple sources, are at times subject to industry-wide shortages and significant commodity pricing fluctuations that can materially adversely affect the Company’s business, results of operations and financial condition. For example, the global semiconductor industry has in the past experienced high demand and shortages of supply, which adversely affected the Company’s ability to obtain sufficient quantities of components and products on commercially reasonable terms, or at all. Such disruptions could occur in the future. While the Company has entered into agreements for the supply of many components, there can be no assurance the Company will be able to extend or renew these agreements on similar terms, or at all. In addition, component suppliers may suffer from poor financial conditions, which can lead to business failure for the supplier or consolidation within a particular industry, further limiting the Company’s ability to obtain sufficient quantities of components on commercially reasonable terms, or at all. Therefore, the Company remains subject to significant risks of supply shortages and price increases that can materially adversely affect its business, results of operations and financial condition.\nThe Company’s new products often utilize custom components available from only one source. When a component or product uses new technologies, initial capacity constraints may exist until the suppliers’ yields have matured or their manufacturing capacities have increased. The continued availability of these components at acceptable prices, or at all, can be affected for any number of reasons, including if suppliers decide to concentrate on the production of common components instead of components customized to meet the Company’s requirements. When the Company’s supply of components for a new or existing product has been delayed or constrained, or when an outsourcing partner has delayed shipments of completed products to the Company, the Company’s business, results of operations and financial condition have been adversely affected and future delays or constraints could materially adversely affect the Company’s business, results of operations and financial condition. The Company’s business and financial performance could also be materially adversely affected depending on the time required to obtain sufficient quantities from the source, or to identify and obtain sufficient quantities from an alternative source.\nApple Inc. | 2024 Form 10-K | 8\n\nThe Company’s products and services may be affected from time to time by design and manufacturing defects that could materially adversely affect the Company’s business and result in harm to the Company’s reputation.\nThe Company offers complex hardware and software products and services that can be affected by design and manufacturing defects. Sophisticated operating system software and applications, such as those offered by the Company, often have issues that can unexpectedly interfere with the intended operation of hardware or software products and services. Defects can also exist in components and products the Company purchases from third parties. Component defects could make the Company’s products unsafe and create a risk of environmental or property damage and personal injury. These risks may increase as the Company’s products are introduced into specialized applications, including health. In addition, the Company’s service offerings can have quality issues and from time to time experience outages, service slowdowns or errors. As a result, from time to time the Company’s services have not performed as anticipated and may not meet customer expectations. The introduction of new and complex technologies, such as artificial intelligence features, can increase these and other safety risks, including exposing users to harmful, inaccurate or other negative content and experiences. There can be no assurance the Company will be able to detect and fix all issues and defects in the hardware, software and services it offers. Failure to do so can result in widespread technical and performance issues affecting the Company’s products and services. Errors, bugs and vulnerabilities can be exploited by third parties, compromising the safety and security of a user’s device. In addition, the Company can be exposed to product liability claims, recalls, product replacements or modifications, write-offs of inventory, property, plant and equipment or intangible assets, and significant warranty and other expenses, including litigation costs and regulatory fines. Quality problems can adversely affect the experience for users of the Company’s products and services, and result in harm to the Company’s reputation, loss of competitive advantage, poor market acceptance, reduced demand for products and services, delay in new product and service introductions and lost sales.\nThe Company is exposed to the risk of write-downs on the value of its inventory and other assets, in addition to purchase commitment cancellation risk.\nThe Company records a write-down for product and component inventories that have become obsolete or exceed anticipated demand, or for which cost exceeds net realizable value. The Company also accrues necessary cancellation fee reserves for orders of excess products and components. The Company reviews long-lived assets, including capital assets held at its suppliers’ facilities and inventory prepayments, for impairment whenever events or circumstances indicate the assets may not be recoverable. If the Company determines that an impairment has occurred, it records a write-down equal to the amount by which the carrying value of the asset exceeds its fair value. Although the Company believes its inventory, capital assets, inventory prepayments and other assets and purchase commitments are currently recoverable, there can be no assurance the Company will not incur write-downs, fees, impairments and other charges given the rapid and unpredictable pace of product obsolescence in the industries in which the Company competes.\nThe Company orders components for its products and builds inventory in advance of product announcements and shipments. Manufacturing purchase obligations cover the Company’s forecasted component and manufacturing requirements, typically for periods up to 150 days. Because the Company’s markets are volatile, competitive and subject to rapid technology and price changes, there is a risk the Company will forecast incorrectly and order or produce excess or insufficient amounts of components or products, or not fully utilize firm purchase commitments.\nThe Company relies on access to third-party intellectual property, which may not be available to the Company on commercially reasonable terms, or at all.\nThe Company’s products and services are designed to include intellectual property owned by third parties, which requires licenses from those third parties. In addition, because of technological changes in the industries in which the Company currently competes or in the future may compete, current extensive patent coverage and the rapid rate of issuance of new patents, the Company’s products and services can unknowingly infringe existing patents or intellectual property rights of others. From time to time, the Company has been notified that it may be infringing certain patents or other intellectual property rights of third parties. Based on experience and industry practice, the Company believes licenses to such third-party intellectual property can generally be obtained on commercially reasonable terms. However, there can be no assurance the necessary licenses can be obtained on commercially reasonable terms or at all. Failure to obtain the right to use third-party intellectual property, or to use such intellectual property on commercially reasonable terms, can require the Company to modify certain products, services or features or preclude the Company from selling certain products or services, or otherwise have a material adverse impact on the Company’s business, results of operations and financial condition.\nApple Inc. | 2024 Form 10-K | 9\n\nThe Company’s future performance depends in part on support from third-party software developers.\nThe Company believes decisions by customers to purchase its hardware products depend in part on the availability of third-party software applications and services. There can be no assurance third-party developers will continue to develop and maintain software applications and services for the Company’s products. If third-party software applications and services cease to be developed and maintained for the Company’s products, customers may choose not to buy the Company’s products.\nThe Company believes the availability of third-party software applications and services for its products depends in part on the developers’ perception and analysis of the relative benefits of developing, maintaining and upgrading such software and services for the Company’s products compared to competitors’ platforms, such as Android for smartphones and tablets, Windows for personal computers and tablets, and PlayStation, Nintendo and Xbox for gaming platforms. This analysis may be based on factors such as the market position of the Company and its products, the anticipated revenue that may be generated, expected future growth of product sales, and the costs of developing such applications and services.\nThe Company’s minority market share in the global smartphone, personal computer and tablet markets can make developers less inclined to develop or upgrade software for the Company’s products and more inclined to devote their resources to developing and upgrading software for competitors’ products with larger market share. When developers focus their efforts on these competing platforms, the availability and quality of applications for the Company’s devices can suffer.\nThe Company relies on the continued availability and development of compelling and innovative software applications for its products. The Company’s products and operating systems are subject to rapid technological change, and when third-party developers are unable to or choose not to keep up with this pace of change, their applications can fail to take advantage of these changes to deliver improved customer experiences, can operate incorrectly, and can result in dissatisfied customers and lower customer demand for the Company’s products.\nThe Company distributes third-party applications for its products through the App Store. For the vast majority of applications, developers keep all of the revenue they generate on the App Store. Where applicable, the Company retains a commission from sales of applications and sales of digital services or goods initiated within an application. From time to time, the Company has made changes to its products and services, including taking actions in response to litigation, competition, market conditions and legal and regulatory requirements, and expects to make further business changes in the future. For example, in the U.S., the Company has implemented changes to how developers communicate with consumers within apps on the U.S. storefront of the iOS and iPadOS App Store regarding alternative purchasing mechanisms. The Company has also implemented changes to iOS, iPadOS, the App Store and Safari® in the European Union (“EU”) as it seeks to comply with the Digital Markets Act (the “DMA”), including new business terms and alternative fee structures for iOS and iPadOS apps, alternative methods of distribution for iOS and iPadOS apps, alternative payment processing for apps across the Company’s operating systems, and additional tools and application programming interfaces (“APIs”) for developers. Changes to the Company’s products and services could materially adversely affect the Company’s business, results of operations and financial condition, including if such business changes result in reduced App Store or other sales, reductions in the rate of the commission that the Company retains on such sales, or if the rate of the commission is otherwise narrowed in scope or eliminated.\nFailure to obtain or create digital content that appeals to the Company’s customers, or to make such content available on commercially reasonable terms, could have a material adverse impact on the Company’s business, results of operations and financial condition.\nThe Company contracts with numerous third parties to offer their digital content to customers. This includes the right to sell, or offer subscriptions to, third-party content, as well as the right to incorporate specific content into the Company’s own services. The licensing or other distribution arrangements for this content can be for relatively short time periods and do not guarantee the continuation or renewal of these arrangements on commercially reasonable terms, or at all. Some third-party content providers and distributors currently or in the future may offer competing products and services, and can take actions to make it difficult or impossible for the Company to license or otherwise distribute their content. Other content owners, providers or distributors may seek to limit the Company’s access to, or increase the cost of, such content. The Company may be unable to continue to offer a wide variety of content at commercially reasonable prices with acceptable usage rules.\nThe Company also produces its own digital content, which can be costly to produce due to intense and increasing competition for talent, content and subscribers, and may fail to appeal to the Company’s customers.\nSome third-party digital content providers require the Company to provide digital rights management and other security solutions. If requirements change, the Company may have to develop or license new technology to provide these solutions. There can be no assurance the Company will be able to develop or license such solutions at a reasonable cost and in a timely manner.\nApple Inc. | 2024 Form 10-K | 10\n\nThe Company’s success depends largely on the talents and efforts of its team members, the continued service and availability of highly skilled employees, including key personnel, and the Company’s ability to nurture its distinctive and inclusive culture.\nMuch of the Company’s future success depends on the talents and efforts of its team members and the continued availability and service of key personnel, including its Chief Executive Officer, executive team and other highly skilled employees. Experienced personnel in the technology industry are in high demand and competition for their talents is intense, especially in Silicon Valley, where most of the Company’s key personnel are located. In addition to intense competition for talent, workforce dynamics are constantly evolving. If the Company does not manage changing workforce dynamics effectively, it could materially adversely affect the Company’s culture, reputation and operational flexibility.\nThe Company believes that its distinctive and inclusive culture is a significant driver of its success. If the Company is unable to nurture its culture, it could materially adversely affect the Company’s ability to recruit and retain the highly skilled employees who are critical to its success, and could otherwise materially adversely affect the Company’s business, reputation, results of operations and financial condition.\nThe Company depends on the performance of carriers, wholesalers, retailers and other resellers.\nThe Company distributes its products and certain of its services through cellular network carriers, wholesalers, retailers and resellers, many of which distribute products and services from competitors. The Company also sells its products and services and resells third-party products in most of its major markets directly to consumers, small and mid-sized businesses, and education, enterprise and government customers through its retail and online stores and its direct sales force.\nSome carriers providing cellular network service for the Company’s products offer financing, installment payment plans or subsidies for users’ purchases of the device. There can be no assurance such offers will be continued at all or in the same amounts.\nThe Company has invested and will continue to invest in programs to enhance reseller sales, including staffing selected resellers’ stores with Company employees and contractors, and improving product placement displays. These programs can require a substantial investment while not assuring return or incremental sales. The financial condition of these resellers could weaken, these resellers could stop distributing the Company’s products, or uncertainty regarding demand for some or all of the Company’s products could cause resellers to reduce their ordering and marketing of the Company’s products.\nThe Company’s business and reputation are impacted by information technology system failures and network disruptions.\nThe Company and its global supply chain are dependent on complex information technology systems and are exposed to information technology system failures or network disruptions caused by natural disasters, accidents, power disruptions, telecommunications failures, acts of terrorism or war, computer viruses, physical or electronic break-ins, ransomware or other cybersecurity incidents, or other events or disruptions. System upgrades, redundancy and other continuity measures may be ineffective or inadequate, and the Company’s or its vendors’ business continuity and disaster recovery planning may not be sufficient for all eventualities. Such failures or disruptions can adversely impact the Company’s business by, among other things, preventing access to the Company’s online services, interfering with customer transactions or impeding the manufacturing and shipping of the Company’s products. These events could materially adversely affect the Company’s business, reputation, results of operations and financial condition.\nLosses or unauthorized access to or releases of confidential information, including personal information, could subject the Company to significant reputational, financial, legal and operational consequences.\nThe Company’s business requires it to use and store confidential information, including personal information with respect to the Company’s customers and employees. The Company devotes significant resources to systems and data security, including through the use of encryption and other security measures intended to protect its systems and data. But these measures cannot provide absolute security, and losses or unauthorized access to or releases of confidential information occur and could materially adversely affect the Company’s business, reputation, results of operations and financial condition.\nThe Company’s business also requires it to share confidential information with suppliers and other third parties. The Company relies on global suppliers that are also exposed to ransomware and other malicious attacks that can disrupt business operations. Although the Company takes steps to secure confidential information that is provided to or accessible by third parties working on the Company’s behalf, such measures are not always effective and losses or unauthorized access to, or releases of, confidential information occur. Such incidents and other malicious attacks could materially adversely affect the Company’s business, reputation, results of operations and financial condition.\nApple Inc. | 2024 Form 10-K | 11\n\nThe Company experiences malicious attacks and other attempts to gain unauthorized access to its systems on a regular basis. These attacks seek to compromise the confidentiality, integrity or availability of confidential information or disrupt normal business operations, and can, among other things, impair the Company’s ability to attract and retain customers for its products and services, impact the Company’s stock price, materially damage commercial relationships, and expose the Company to litigation or government investigations, which can result in penalties, fines or judgments against the Company. Globally, attacks are expected to continue accelerating in both frequency and sophistication with increasing use by actors of tools and techniques that are designed to circumvent controls, avoid detection, and remove or obfuscate forensic evidence, all of which hinders the Company’s ability to identify, investigate and recover from incidents. In addition, attacks against the Company and its customers can escalate during periods of geopolitical tensions or conflict.\nAlthough malicious attacks perpetrated to gain access to confidential information, including personal information, affect many companies across various industries, the Company is at a relatively greater risk of being targeted because of its high profile and the value of the confidential information it creates, owns, manages, stores and processes.\nThe Company has implemented systems and processes intended to secure its information technology systems and prevent unauthorized access to or loss of sensitive data, and mitigate the impact of unauthorized access, including through the use of encryption and authentication technologies. As with all companies, these security measures may not be sufficient for all eventualities and are vulnerable to hacking, ransomware attacks, employee error, malfeasance, system error, faulty password management or other irregularities. For example, third parties can fraudulently induce the Company’s or its suppliers’ and other third parties’ employees or customers into disclosing usernames, passwords or other sensitive information, which can, in turn, be used for unauthorized access to the Company’s or such suppliers’ or third parties’ systems and services. To help protect customers and the Company, the Company deploys and makes available technologies like multifactor authentication, monitors its services and systems for unusual activity and may freeze accounts under suspicious circumstances, which, among other things, can result in the delay or loss of customer orders or impede customer access to the Company’s products and services.\nWhile the Company maintains insurance coverage that is intended to address certain aspects of data security risks, such insurance coverage may be insufficient to cover all losses or all types of claims that may arise.\nInvestment in new business strategies and acquisitions could disrupt the Company’s ongoing business, present risks not originally contemplated and materially adversely affect the Company’s business, reputation, results of operations and financial condition.\nThe Company has invested, and in the future may invest, in new business strategies or acquisitions. Such endeavors may involve significant risks and uncertainties, including distraction of management from current operations, greater-than-expected liabilities and expenses, economic, political, legal and regulatory challenges associated with operating in new businesses, regions or countries, inadequate return on capital, potential impairment of tangible and intangible assets, and significant write-offs. Investment and acquisition transactions are exposed to additional risks, including failing to obtain required regulatory approvals on a timely basis or at all, or the imposition of onerous conditions that could delay or prevent the Company from completing a transaction or otherwise limit the Company’s ability to fully realize the anticipated benefits of a transaction. These new ventures are inherently risky and may not be successful. The failure of any significant investment could materially adversely affect the Company’s business, reputation, results of operations and financial condition.\nThe Company’s retail stores are subject to numerous risks and uncertainties.\nThe Company’s retail operations are subject to many factors that pose risks and uncertainties and could adversely impact the Company’s business, results of operations and financial condition, including macroeconomic factors that could have an adverse effect on general retail activity. Other factors include the Company’s ability to: manage costs associated with retail store construction and operation; manage relationships with existing retail partners; manage costs associated with fluctuations in the value of retail inventory; and obtain and renew leases in quality retail locations at a reasonable cost.\nApple Inc. | 2024 Form 10-K | 12\n\nLegal and Regulatory Compliance Risks\nThe Company’s business, results of operations and financial condition could be adversely impacted by unfavorable results of legal proceedings or government investigations.\nThe Company is subject to various claims, legal proceedings and government investigations that have arisen in the ordinary course of business and have not yet been fully resolved, and new matters may arise in the future. In addition, agreements entered into by the Company sometimes include indemnification provisions which can subject the Company to costs and damages in the event of a claim against an indemnified third party. The number of claims, legal proceedings and government investigations involving the Company, and the alleged magnitude of such claims, proceedings and government investigations, has generally increased over time and may continue to increase.\nThe Company has faced and continues to face a significant number of patent claims relating to its cellular-enabled products, and new claims may arise in the future, including as a result of new legal or regulatory frameworks. For example, technology and other patent-holding companies frequently assert their patents and seek royalties and often enter into litigation based on allegations of patent infringement or other violations of intellectual property rights. The Company is vigorously defending infringement actions in courts in several U.S. jurisdictions, as well as internationally in various countries. The plaintiffs in these actions frequently seek broad injunctive relief and substantial damages.\nRegardless of the merit of particular claims, defending against litigation or responding to government investigations can be expensive, time-consuming and disruptive to the Company’s operations. In recognition of these considerations, the Company may enter into agreements or other arrangements to settle litigation and resolve such challenges. There can be no assurance such agreements can be obtained on acceptable terms or that litigation will not occur. These agreements can also significantly increase the Company’s cost of sales and operating expenses and require the Company to change its business practices and limit the Company’s ability to offer certain products and services.\nThe outcome of litigation or government investigations is inherently uncertain. If one or more legal matters were resolved against the Company or an indemnified third party in a reporting period for amounts above management’s expectations, the Company’s results of operations and financial condition for that reporting period could be materially adversely affected. Further, such an outcome can result in significant monetary damages, disgorgement of revenue or profits, remedial corporate measures or injunctive relief against the Company, and has from time to time required, and can in the future require, the Company to change its business practices and limit the Company’s ability to develop, manufacture, use, import or offer for sale certain products and services, all of which could materially adversely affect the Company’s business, reputation, results of operations and financial condition.\nWhile the Company maintains insurance coverage for certain types of claims, such insurance coverage may be insufficient to cover all losses or all types of claims that may arise.\nThe Company is subject to complex and changing laws and regulations worldwide, which exposes the Company to potential liabilities, increased costs and other adverse effects on the Company’s business.\nThe Company’s global operations are subject to complex and changing laws and regulations on subjects, including antitrust; privacy, data security and data localization; consumer protection; advertising, sales, billing and e-commerce; financial services and technology; product liability; intellectual property ownership and infringement; digital platforms; machine learning and artificial intelligence; internet, telecommunications and mobile communications; media, television, film and digital content; availability of third-party software applications and services; labor and employment; anticorruption; import, export and trade; foreign exchange controls and cash repatriation restrictions; anti–money laundering; foreign ownership and investment; tax; and environmental, health and safety, including electronic waste, recycling, product design and climate change.\nCompliance with these laws and regulations is onerous and expensive. New and changing laws and regulations can adversely affect the Company’s business by increasing the Company’s costs, limiting the Company’s ability to offer a product, service or feature to customers, imposing changes to the design of the Company’s products and services, impacting customer demand for the Company’s products and services, and requiring changes to the Company’s business or supply chain. New and changing laws and regulations can also create uncertainty about how such laws and regulations will be interpreted and applied. These risks and costs may increase as the Company’s products and services are introduced into specialized applications, including health and financial services, or as the Company expands the use of technologies, such as machine learning and artificial intelligence features, and must navigate new legal, regulatory and ethical considerations relating to such technologies. The Company has implemented policies and procedures designed to ensure compliance with applicable laws and regulations, but there can be no assurance the Company’s employees, contractors or agents will not violate such laws and regulations or the Company’s policies and procedures. If the Company is found to have violated laws and regulations, it could materially adversely affect the Company’s business, reputation, results of operations and financial condition.\nApple Inc. | 2024 Form 10-K | 13\n\nRegulatory changes and other actions that materially adversely affect the Company’s business may be announced with little or no advance notice and the Company may not be able to effectively mitigate all adverse impacts from such measures. For example, the Company is subject to changing regulations relating to the export and import of its products. Although the Company has programs, policies and procedures in place that are designed to satisfy regulatory requirements, there can be no assurance that such policies and procedures will be effective in preventing a violation or a claim of a violation. As a result, the Company’s products could be banned, delayed or prohibited from importation, which could materially adversely affect the Company’s business, reputation, results of operations and financial condition.\nExpectations relating to environmental, social and governance considerations and related reporting obligations expose the Company to potential liabilities, increased costs, reputational harm, and other adverse effects on the Company’s business.\nMany governments, regulators, investors, employees, customers and other stakeholders are increasingly focused on environmental, social and governance considerations relating to businesses, including climate change and greenhouse gas emissions, human and civil rights, and diversity, equity and inclusion. In addition, the Company makes statements about its goals and initiatives through its various non-financial reports, information provided on its website, press statements and other communications. Responding to these environmental, social and governance considerations and implementation of the Company’s announced goals and initiatives involves risks and uncertainties, requires investments, and depends in part on third-party performance or data that is outside the Company’s control. The Company cannot guarantee that it will achieve its announced environmental, social and governance goals and initiatives. In addition, some stakeholders may disagree with the Company’s goals and initiatives. Any failure, or perceived failure, by the Company to achieve its goals, further its initiatives, adhere to its public statements, comply with federal, state and international environmental, social and governance laws and regulations, or meet evolving and varied stakeholder expectations and standards could result in legal and regulatory proceedings against the Company and materially adversely affect the Company’s business, reputation, results of operations, financial condition and stock price.\nThe technology industry, including, in some instances, the Company, is subject to intense media, political and regulatory scrutiny, which exposes the Company to increasing regulation, government investigations, legal actions and penalties.\nFrom time to time, the Company has made changes to its App Store, including actions taken in response to litigation, competition, market conditions and legal and regulatory requirements. The Company expects to make further business changes in the future. For example, in the U.S. the Company has implemented changes to how developers communicate with consumers within apps on the U.S. storefront of the iOS and iPadOS App Store regarding alternative purchasing mechanisms.\nThe Company has also implemented changes to iOS, iPadOS, the App Store and Safari in the EU as it seeks to comply with the DMA, including new business terms and alternative fee structures for iOS and iPadOS apps, alternative methods of distribution for iOS and iPadOS apps, alternative payment processing for apps across the Company’s operating systems, and additional tools and APIs for developers. The Company has also continued to make changes to its compliance plan in response to feedback and engagement with the European Commission (the “Commission”). Although the Company’s compliance plan is intended to address the DMA’s obligations, it has been challenged by the Commission and may be challenged further by private litigants. The DMA provides for significant fines and penalties for noncompliance, and other jurisdictions may seek to require the Company to make changes to its business. While the changes introduced by the Company in the EU are intended to reduce new privacy and security risks that the DMA poses to EU users, many risks will remain.\nThe Company is also currently subject to antitrust investigations and litigation in various jurisdictions around the world, which can result in legal proceedings and claims against the Company that could, individually or in the aggregate, have a materially adverse impact on the Company’s business, results of operations and financial condition. For example, the Company is subject to civil antitrust lawsuits in the U.S. alleging monopolization or attempted monopolization in the markets for “performance smartphones” and “smartphones” generally in violation of U.S. antitrust laws. In addition, the Company is the subject of investigations in Europe and other jurisdictions relating to App Store terms and conditions. If such investigations or litigation are resolved against the Company, the Company can be exposed to significant fines and may be required to make further changes to its business practices, all of which could materially adversely affect the Company’s business, reputation, results of operations and financial condition.\nFurther, the Company has commercial relationships with other companies in the technology industry that are or may become subject to investigations and litigation that, if resolved against those other companies, could materially adversely affect the Company’s commercial relationships with those business partners and materially adversely affect the Company’s business, results of operations and financial condition. For example, the Company earns revenue from licensing arrangements with Google LLC and other companies to offer their search services on the Company’s platforms and applications, and certain of these arrangements are currently subject to government investigations and legal proceedings.\nApple Inc. | 2024 Form 10-K | 14\n\nThere can be no assurance the Company’s business will not be materially adversely affected, individually or in the aggregate, by the outcomes of such investigations, litigation or changes to laws and regulations in the future. Changes to the Company’s business practices to comply with new laws and regulations or in connection with other legal proceedings can negatively impact the reputation of the Company’s products for privacy and security and otherwise adversely affect the experience for users of the Company’s products and services, and result in harm to the Company’s reputation, loss of competitive advantage, poor market acceptance, reduced demand for products and services, and lost sales.\nThe Company’s business is subject to a variety of U.S. and international laws, rules, policies and other obligations regarding data protection.\nThe Company is subject to an increasing number of federal, state and international laws relating to the collection, use, retention, security and transfer of various types of personal information. In many cases, these laws apply not only to third-party transactions, but also restrict transfers of personal information among the Company and its international subsidiaries. Several jurisdictions have passed laws in this area, and additional jurisdictions are considering imposing additional restrictions or have laws that are pending. These laws continue to develop and may be inconsistent from jurisdiction to jurisdiction. Complying with emerging and changing requirements causes the Company to incur substantial costs and has required and may in the future require the Company to change its business practices. Noncompliance could result in significant penalties or legal liability.\nThe Company makes statements about its use and disclosure of personal information through its privacy policy, information provided on its website, press statements and other privacy notices provided to customers. Any failure by the Company to comply with these public statements or with federal, state or international privacy or data protection laws and regulations could result in inquiries or proceedings against the Company by governmental entities or others. In addition to reputational impacts, penalties could include ongoing audit requirements and significant legal liability.\nIn addition to the risks generally relating to the collection, use, retention, security and transfer of personal information, the Company is also subject to specific obligations relating to information considered sensitive under applicable laws, such as health data, financial data and biometric data. Health data and financial data are subject to additional privacy, security and breach notification requirements, and the Company is subject to audit by governmental authorities regarding the Company’s compliance with these obligations. If the Company fails to adequately comply with these rules and requirements, or if health data or financial data is handled in a manner not permitted by law or under the Company’s agreements with healthcare or financial institutions, the Company can be subject to litigation or government investigations, and can be liable for associated investigatory expenses, and can also incur significant fees or fines.\nPayment card data is also subject to additional requirements. Under payment card rules and obligations, if cardholder information is potentially compromised, the Company can be liable for associated investigatory expenses and can also incur significant fees or fines if the Company fails to follow payment card industry data security standards. The Company could also experience a significant increase in payment card transaction costs or lose the ability to process payment cards if it fails to follow payment card industry data security standards, which could materially adversely affect the Company’s business, reputation, results of operations and financial condition.\nFinancial Risks\nThe Company expects its quarterly net sales and results of operations to fluctuate.\nThe Company’s profit margins vary across its products, services, geographic segments and distribution channels. For example, the gross margins on the Company’s products and services vary significantly and can change over time. The Company’s gross margins are subject to volatility and downward pressure due to a variety of factors, including: continued industry-wide global product pricing pressures and product pricing actions that the Company may take in response to such pressures; increased competition; the Company’s ability to effectively stimulate demand for certain of its products and services; compressed product life cycles; supply shortages; potential increases in the cost of components, outside manufacturing services, and developing, acquiring and delivering content for the Company’s services; the Company’s ability to manage product quality and warranty costs effectively; shifts in the mix of products and services, or in the geographic, currency or channel mix, including to the extent that regulatory changes require the Company to modify its product and service offerings; fluctuations in foreign exchange rates; inflation and other macroeconomic pressures; and the introduction of new products or services, including new products or services with lower profit margins. These and other factors could have a materially adverse impact on the Company’s results of operations and financial condition.\nThe Company has historically experienced higher net sales in its first quarter compared to other quarters in its fiscal year due in part to seasonal holiday demand. Additionally, new product and service introductions can significantly impact net sales, cost of sales and operating expenses. Further, the Company generates a significant portion of its net sales from a single product and a decline in demand for that product could significantly impact quarterly net sales. The Company could also be subject to unexpected developments, such as lower-than-anticipated demand for the Company’s products or services, issues with new product or service introductions, information technology system failures or network disruptions, or failure of one of the Company’s logistics, supply or manufacturing partners.\nApple Inc. | 2024 Form 10-K | 15\n\nThe Company’s financial performance is subject to risks associated with changes in the value of the U.S. dollar relative to local currencies.\nThe Company’s primary exposure to movements in foreign exchange rates relates to non–U.S. dollar–denominated sales, cost of sales and operating expenses worldwide. Gross margins on the Company’s products in foreign countries and on products that include components obtained from foreign suppliers have in the past been adversely affected and could in the future be materially adversely affected by foreign exchange rate fluctuations.\nThe weakening of foreign currencies relative to the U.S. dollar adversely affects the U.S. dollar value of the Company’s foreign currency–denominated sales and earnings, and generally leads the Company to raise international pricing, potentially reducing demand for the Company’s products. In some circumstances, for competitive or other reasons, the Company may decide not to raise international pricing to offset the U.S. dollar’s strengthening, which would adversely affect the U.S. dollar value of the gross margins the Company earns on foreign currency–denominated sales.\nConversely, a strengthening of foreign currencies relative to the U.S. dollar, while generally beneficial to the Company’s foreign currency–denominated sales and earnings, could cause the Company to reduce international pricing or incur losses on its foreign currency derivative instruments, thereby limiting the benefit. Additionally, strengthening of foreign currencies may increase the Company’s cost of product components denominated in those currencies, thus adversely affecting gross margins.\nThe Company uses derivative instruments, such as foreign currency forward and option contracts, to hedge certain exposures to fluctuations in foreign exchange rates. The use of such hedging activities may not be effective to offset any, or more than a portion, of the adverse financial effects of unfavorable movements in foreign exchange rates over the limited time the hedges are in place.\nThe Company is exposed to credit risk and fluctuations in the values of its investment portfolio.\nThe Company’s investments can be negatively affected by changes in liquidity, credit deterioration, financial results, market and economic conditions, political risk, sovereign risk, interest rate fluctuations or other factors. As a result, the value and liquidity of the Company’s cash, cash equivalents and marketable securities may fluctuate substantially. Although the Company has not realized significant losses on its cash, cash equivalents and marketable securities, future fluctuations in their value could result in significant losses and could have a material adverse impact on the Company’s results of operations and financial condition.\nThe Company is exposed to credit risk on its trade accounts receivable, vendor non-trade receivables and prepayments related to long-term supply agreements, and this risk is heightened during periods when economic conditions worsen.\nThe Company distributes its products and certain of its services through third-party cellular network carriers, wholesalers, retailers and resellers. The Company also sells its products and services directly to small and mid-sized businesses and education, enterprise and government customers. A substantial majority of the Company’s outstanding trade receivables are not covered by collateral, third-party bank support or financing arrangements, or credit insurance, and a significant portion of the Company’s trade receivables can be concentrated within cellular network carriers or other resellers. The Company’s exposure to credit and collectibility risk on its trade receivables is higher in certain international markets and its ability to mitigate such risks may be limited. The Company also has unsecured vendor non-trade receivables resulting from purchases of components by outsourcing partners and other vendors that manufacture subassemblies or assemble final products for the Company. In addition, the Company has made prepayments associated with long-term supply agreements to secure supply of inventory components. As of September 28, 2024, the Company’s vendor non-trade receivables and prepayments related to long-term supply agreements were concentrated among a few individual vendors located primarily in Asia. While the Company has procedures to monitor and limit exposure to credit risk on its trade and vendor non-trade receivables, as well as long-term prepayments, there can be no assurance such procedures will effectively limit its credit risk and avoid losses.\nThe Company is subject to changes in tax rates, the adoption of new U.S. or international tax legislation and exposure to additional tax liabilities.\nThe Company is subject to taxes in the U.S. and numerous foreign jurisdictions, including Ireland and Singapore, where a number of the Company’s subsidiaries are organized. Due to economic and political conditions, tax laws and tax rates for income taxes and other non-income taxes in various jurisdictions may be subject to significant change. For example, the Organisation for Economic Co-operation and Development continues to advance proposals for modernizing international tax rules, including the introduction of global minimum tax standards. The Company’s effective tax rates are affected by changes in the mix of earnings in countries with differing statutory tax rates, changes in the valuation of deferred tax assets and liabilities, the introduction of new taxes, and changes in tax laws or their interpretation. The application of tax laws may be uncertain, require significant judgment and be subject to differing interpretations.\nApple Inc. | 2024 Form 10-K | 16\n\nThe Company is also subject to the examination of its tax returns and other tax matters by the U.S. Internal Revenue Service and other tax authorities and governmental bodies. The Company regularly assesses the likelihood of an adverse outcome resulting from these examinations to determine the adequacy of its provision for taxes. There can be no assurance as to the outcome of these examinations. If the Company’s effective tax rates were to increase, or if the ultimate determination of the Company’s taxes owed is for an amount in excess of amounts previously accrued, the Company’s business, results of operations and financial condition could be materially adversely affected.\nGeneral Risks\nThe price of the Company’s stock is subject to volatility.\nThe Company’s stock has experienced substantial price volatility in the past and may continue to do so in the future. Additionally, the Company, the technology industry and the stock market as a whole have, from time to time, experienced extreme stock price and volume fluctuations that have affected stock prices in ways that may have been unrelated to these companies’ operating performance. Price volatility may cause the average price at which the Company repurchases its stock in a given period to exceed the stock’s price at a given point in time. The Company believes the price of its stock should reflect expectations of future growth and profitability. The Company also believes the price of its stock should reflect expectations that its cash dividend will continue at current levels or grow, and that its current share repurchase program will be fully consummated. Future dividends are subject to declaration by the Company’s Board of Directors (the “Board”), and the Company’s share repurchase program does not obligate it to acquire any specific number of shares. If the Company fails to meet expectations related to future growth, profitability, dividends, share repurchases or other market expectations, the price of the Company’s stock may decline significantly, which could have a material adverse impact on investor confidence and employee retention.\nItem 1B. Unresolved Staff Comments\nNone.\nItem 1C. Cybersecurity\nThe Company’s management, led by its Head of Corporate Information Security, has overall responsibility for identifying, assessing and managing any material risks from cybersecurity threats. The Company’s Head of Corporate Information Security leads a dedicated Information Security team of highly skilled individuals with experience across industries that, among other things, develops and distributes information security policies, standards and procedures; engages in employee cybersecurity training; implements security controls; assesses security risk and compliance posture; monitors and responds to security events; and executes security testing and assessments. The Company’s Head of Corporate Information Security has extensive knowledge and skills gained from over 25 years of experience in the cybersecurity industry, including serving in leadership positions at other large technology companies and leading the Company’s Information Security team since 2016.\nThe Company’s Information Security team coordinates with teams across the Company to prevent, respond to and manage security incidents, and engages third parties, as appropriate, to assess, test or otherwise assist with aspects of its security processes and incident response. A dedicated Supplier Trust team manages information security risks the Company is exposed to through its supplier relationships. The Company has processes to log, track, address, and escalate for further assessment and report, as appropriate, cybersecurity incidents across the Company and its suppliers to senior management and the Audit and Finance Committee (the “Audit Committee”) of the Board. The Company’s enterprise risk management program is designed to identify, assess, and monitor the Company’s business risks, including financial, operational, compliance and reputational risks, and reflects management’s assessment of cybersecurity risks.\nThe Audit Committee assists the Board in the oversight and monitoring of cybersecurity matters. The Audit Committee regularly reviews and discusses the Company’s cybersecurity risks with management, including the Company’s Head of Corporate Information Security, its General Counsel and the Heads of Compliance and Business Conduct, Business Assurance, and Internal Audit, and receives updates, as necessary, regarding cybersecurity incidents. The Chair of the Audit Committee regularly reports the substance of such reviews and discussions to the Board, as necessary, and recommends to the Board such actions as the Audit Committee deems appropriate.\nFor a discussion of the Company’s cybersecurity-related risks, see Item 1A of this Form 10-K under the heading “Risk Factors.”\nApple Inc. | 2024 Form 10-K | 17\n\nItem 2. Properties\nThe Company’s headquarters is located in Cupertino, California. As of September 28, 2024, the Company owned or leased facilities and land for corporate functions, R&D, data centers, retail and other purposes at locations throughout the U.S. and in various places outside the U.S. The Company believes its existing facilities and equipment, which are used by all reportable segments, are in good operating condition and are suitable for the conduct of its business.\nItem 3. Legal Proceedings\nDigital Markets Act Investigations\nOn March 25, 2024, the Commission announced that it had opened two formal noncompliance investigations against the Company under the DMA. The Commission’s investigations concern (1) Article 5(4) of the DMA, which relates to how developers may communicate and promote offers to end users for apps distributed through the App Store as well as how developers may conclude contracts with those end users; and (2) Article 6(3) of the DMA, which relates to default settings, uninstallation of apps, and a web browser choice screen on iOS. On June 24, 2024, the Commission announced its preliminary findings in the Article 5(4) investigation alleging that the Company’s App Store rules are in breach of the DMA and announced that it had opened a third formal investigation against the Company regarding whether the Company’s new contractual requirements for third-party app developers and app marketplaces may violate the DMA. If the Commission makes a final determination that there has been a violation, it can issue a cease and desist order and may impose fines up to 10% of the Company’s annual worldwide net sales. Although any decision by the Commission can be appealed to the General Court of the EU, the effectiveness of the Commission’s order would apply immediately while the appeal is pending, unless a stay of the order is granted. The Company believes that it complies with the DMA and has continued to make changes to its compliance plan in response to feedback and engagement with the Commission.\nDepartment of Justice Lawsuit\nOn March 21, 2024, the U.S. Department of Justice (the “DOJ”) and a number of state and district attorneys general filed a civil antitrust lawsuit in the U.S. District Court for the District of New Jersey against the Company alleging monopolization or attempted monopolization in the markets for “performance smartphones” and “smartphones” in violation of U.S. antitrust laws. The DOJ is seeking equitable relief to redress the alleged anticompetitive behavior. In addition, various civil litigation matters have been filed in state and federal courts in the U.S. alleging similar violations of U.S. antitrust laws and seeking monetary damages and other nonmonetary relief. The Company believes it has substantial defenses and intends to vigorously defend itself.\nEpic Games\nEpic Games, Inc. (“Epic”) filed a lawsuit in the U.S. District Court for the Northern District of California (the “California District Court”) against the Company alleging violations of federal and state antitrust laws and California’s unfair competition law based upon the Company’s operation of its App Store. The California District Court found that certain provisions of the Company’s App Store Review Guidelines violate California’s unfair competition law and issued an injunction enjoining the Company from prohibiting developers from including in their apps external links that direct customers to purchasing mechanisms other than Apple in-app purchasing. The injunction applies to apps on the U.S. storefront of the iOS and iPadOS App Store. On January 16, 2024, the Company implemented a plan to comply with the injunction and filed a statement of compliance with the California District Court. A motion by Epic disputing the Company’s compliance plan and seeking to enforce the injunction, which the Company has opposed, is pending before the California District Court. On September 30, 2024, the Company filed a motion with the California District Court to narrow or vacate the injunction. The Company believes it has substantial defenses and intends to vigorously defend itself.\nOther Legal Proceedings\nThe Company is subject to other legal proceedings and claims that have not been fully resolved and that have arisen in the ordinary course of business. The Company settled certain matters during the fourth quarter of 2024 that did not individually or in the aggregate have a material impact on the Company’s financial condition or operating results. The outcome of litigation is inherently uncertain. If one or more legal matters were resolved against the Company in a reporting period for amounts above management’s expectations, the Company’s financial condition and operating results for that reporting period could be materially adversely affected.\nItem 4. Mine Safety Disclosures\nNot applicable.\nApple Inc. | 2024 Form 10-K | 18\n\nPART II\nItem 5. Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities\nThe Company’s common stock is traded on The Nasdaq Stock Market LLC under the symbol AAPL.\nHolders\nAs of October 18, 2024, there were 23,301 shareholders of record.\nPurchases of Equity Securities by the Issuer and Affiliated Purchasers\nShare repurchase activity during the three months ended September 28, 2024 was as follows (in millions, except number of shares, which are reflected in thousands, and per-share amounts):\nPeriods\t\tTotal Number\nof Shares Purchased\t\tAverage Price\nPaid Per Share\t\tTotal Number of Shares\nPurchased as Part of Publicly\nAnnounced Plans or Programs\t\t\nApproximate Dollar Value of\nShares That May Yet Be Purchased\nUnder the Plans or Programs (1)\nJune 30, 2024 to August 3, 2024:\nOpen market and privately negotiated purchases\t\t35,697 \t\t\t$\t224.11 \t\t\t35,697 \t\t\t\nAugust 4, 2024 to August 31, 2024:\nOpen market and privately negotiated purchases\t\t42,910 \t\t\t$\t221.39 \t\t\t42,910 \t\t\t\nSeptember 1, 2024 to September 28, 2024:\nOpen market and privately negotiated purchases\t\t33,653 \t\t\t$\t222.86 \t\t\t33,653 \t\t\t\nTotal\t\t112,260 \t\t\t\t\t\t\t$\t89,074 \t\n \n(1)As of September 28, 2024, the Company was authorized by the Board to purchase up to $110 billion of the Company’s common stock under a share repurchase program announced on May 2, 2024, of which $20.9 billion had been utilized. During the fourth quarter of 2024, the Company also utilized the final $4.1 billion under its previous repurchase program, which was authorized in May 2023. The programs do not obligate the Company to acquire a minimum amount of shares. Under the programs, shares may be repurchased in privately negotiated or open market transactions, including under plans complying with Rule 10b5-1 under the Exchange Act.\nApple Inc. | 2024 Form 10-K | 19\n\nCompany Stock Performance\nThe following graph shows a comparison of five-year cumulative total shareholder return, calculated on a dividend-reinvested basis, for the Company, the S&P 500 Index and the Dow Jones U.S. Technology Supersector Index. The graph assumes $100 was invested in each of the Company’s common stock, the S&P 500 Index and the Dow Jones U.S. Technology Supersector Index as of the market close on September 27, 2019. Past stock price performance is not necessarily indicative of future stock price performance.\n2218\nSeptember 2019\t\tSeptember 2020\t\tSeptember 2021\t\tSeptember 2022\t\tSeptember 2023\t\tSeptember 2024\nApple Inc.\t\t$\t100 \t\t\t$\t207 \t\t\t$\t273 \t\t\t$\t281 \t\t\t$\t322 \t\t\t$\t430 \t\nS&P 500 Index\t\t$\t100 \t\t\t$\t113 \t\t\t$\t156 \t\t\t$\t131 \t\t\t$\t155 \t\t\t$\t210 \t\nDow Jones U.S. Technology Supersector Index\t\t$\t100 \t\t\t$\t146 \t\t\t$\t216 \t\t\t$\t156 \t\t\t$\t215 \t\t\t$\t322 \t\n \nItem 6. [Reserved]\nApple Inc. | 2024 Form 10-K | 20\n\nItem 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations\nThe following discussion should be read in conjunction with the consolidated financial statements and accompanying notes included in Part II, Item 8 of this Form 10-K. This Item generally discusses 2024 and 2023 items and year-to-year comparisons between 2024 and 2023. Discussions of 2022 items and year-to-year comparisons between 2023 and 2022 are not included, and can be found in “Management’s Discussion and Analysis of Financial Condition and Results of Operations” in Part II, Item 7 of the Company’s Annual Report on Form 10-K for the fiscal year ended September 30, 2023.\nProduct, Service and Software Announcements\nThe Company announces new product, service and software offerings at various times during the year. Significant announcements during fiscal year 2024 included the following:\nFirst Quarter 2024:\n•MacBook Pro 14-in.;\n•MacBook Pro 16-in.; and\n•iMac.\nSecond Quarter 2024:\n•MacBook Air 13-in.; and\n•MacBook Air 15-in.\nThird Quarter 2024:\n•iPad Air;\n•iPad Pro;\n•iOS 18, macOS Sequoia, iPadOS 18, watchOS 11, visionOS 2 and tvOS 18, updates to the Company’s operating systems; and\n•Apple Intelligence™, a personal intelligence system that uses generative models.\nFourth Quarter 2024:\n•iPhone 16, iPhone 16 Plus, iPhone 16 Pro and iPhone 16 Pro Max;\n•Apple Watch Series 10; and\n•AirPods 4.\nFiscal Period\nThe Company’s fiscal year is the 52- or 53-week period that ends on the last Saturday of September. An additional week is included in the first fiscal quarter every five or six years to realign the Company’s fiscal quarters with calendar quarters, which occurred in the first quarter of 2023. The Company’s fiscal years 2024 and 2022 spanned 52 weeks each, whereas fiscal year 2023 spanned 53 weeks.\nMacroeconomic Conditions\nMacroeconomic conditions, including inflation, interest rates and currency fluctuations, have directly and indirectly impacted, and could in the future materially impact, the Company’s results of operations and financial condition.\nApple Inc. | 2024 Form 10-K | 21\n\nSegment Operating Performance\nThe following table shows net sales by reportable segment for 2024, 2023 and 2022 (dollars in millions):\n2024\t\tChange\t\t2023\t\tChange\t\t2022\nAmericas\t$\t167,045 \t\t\t3 \t%\t\t$\t162,560 \t\t\t(4)\t%\t\t$\t169,658 \t\nEurope\t101,328 \t\t\t7 \t%\t\t94,294 \t\t\t(1)\t%\t\t95,118 \t\nGreater China\t66,952 \t\t\t(8)\t%\t\t72,559 \t\t\t(2)\t%\t\t74,200 \t\nJapan\t25,052 \t\t\t3 \t%\t\t24,257 \t\t\t(7)\t%\t\t25,977 \t\nRest of Asia Pacific\t30,658 \t\t\t4 \t%\t\t29,615 \t\t\t1 \t%\t\t29,375 \t\nTotal net sales\t$\t391,035 \t\t\t2 \t%\t\t$\t383,285 \t\t\t(3)\t%\t\t$\t394,328 \t\n \nAmericas\nAmericas net sales increased during 2024 compared to 2023 due primarily to higher net sales of Services.\nEurope\nEurope net sales increased during 2024 compared to 2023 due primarily to higher net sales of Services and iPhone.\nGreater China\nGreater China net sales decreased during 2024 compared to 2023 due primarily to lower net sales of iPhone and iPad. The weakness in the renminbi relative to the U.S. dollar had an unfavorable year-over-year impact on Greater China net sales during 2024.\nJapan\nJapan net sales increased during 2024 compared to 2023 due primarily to higher net sales of iPhone. The weakness in the yen relative to the U.S. dollar had an unfavorable year-over-year impact on Japan net sales during 2024.\nRest of Asia Pacific\nRest of Asia Pacific net sales increased during 2024 compared to 2023 due primarily to higher net sales of Services. The weakness in foreign currencies relative to the U.S. dollar had a net unfavorable year-over-year impact on Rest of Asia Pacific net sales during 2024.\nApple Inc. | 2024 Form 10-K | 22\n\nProducts and Services Performance\nThe following table shows net sales by category for 2024, 2023 and 2022 (dollars in millions):\n2024\t\tChange\t\t2023\t\tChange\t\t2022\niPhone\t$\t201,183 \t\t\t— \t%\t\t$\t200,583 \t\t\t(2)\t%\t\t$\t205,489 \t\nMac\t29,984 \t\t\t2 \t%\t\t29,357 \t\t\t(27)\t%\t\t40,177 \t\niPad\t26,694 \t\t\t(6)\t%\t\t28,300 \t\t\t(3)\t%\t\t29,292 \t\nWearables, Home and Accessories\t37,005 \t\t\t(7)\t%\t\t39,845 \t\t\t(3)\t%\t\t41,241 \t\nServices (1)\n96,169 \t\t\t13 \t%\t\t85,200 \t\t\t9 \t%\t\t78,129 \t\nTotal net sales\t$\t391,035 \t\t\t2 \t%\t\t$\t383,285 \t\t\t(3)\t%\t\t$\t394,328 \t\n \n(1)Services net sales include amortization of the deferred value of services bundled in the sales price of certain products.\niPhone\niPhone net sales were relatively flat during 2024 compared to 2023.\nMac\nMac net sales increased during 2024 compared to 2023 due primarily to higher net sales of laptops.\niPad\niPad net sales decreased during 2024 compared to 2023 due primarily to lower net sales of iPad Pro and the entry-level iPad models, partially offset by higher net sales of iPad Air.\nWearables, Home and Accessories\nWearables, Home and Accessories net sales decreased during 2024 compared to 2023 due primarily to lower net sales of Wearables and Accessories.\nServices\nServices net sales increased during 2024 compared to 2023 due primarily to higher net sales from advertising, the App Store® and cloud services.\nApple Inc. | 2024 Form 10-K | 23\n\nGross Margin\nProducts and Services gross margin and gross margin percentage for 2024, 2023 and 2022 were as follows (dollars in millions):\n2024\t\t2023\t\t2022\nGross margin:\t\t\t\t\t\nProducts\t$\t109,633 \t\t\t$\t108,803 \t\t\t$\t114,728 \t\nServices\t71,050 \t\t\t60,345 \t\t\t56,054 \t\nTotal gross margin\t$\t180,683 \t\t\t$\t169,148 \t\t\t$\t170,782 \t\n \nGross margin percentage:\t\t\t\t\t\nProducts\t37.2 \t%\t\t36.5 \t%\t\t36.3 \t%\nServices\t73.9 \t%\t\t70.8 \t%\t\t71.7 \t%\nTotal gross margin percentage\t46.2 \t%\t\t44.1 \t%\t\t43.3 \t%\n \nProducts Gross Margin\nProducts gross margin and Products gross margin percentage increased during 2024 compared to 2023 due to cost savings, partially offset by a different Products mix and the weakness in foreign currencies relative to the U.S. dollar.\nServices Gross Margin\nServices gross margin increased during 2024 compared to 2023 due primarily to higher Services net sales.\nServices gross margin percentage increased during 2024 compared to 2023 due to a different Services mix.\nThe Company’s future gross margins can be impacted by a variety of factors, as discussed in Part I, Item 1A of this Form 10-K under the heading “Risk Factors.” As a result, the Company believes, in general, gross margins will be subject to volatility and downward pressure.\nOperating Expenses\nOperating expenses for 2024, 2023 and 2022 were as follows (dollars in millions):\n2024\t\tChange\t\t2023\t\tChange\t\t2022\nResearch and development\t$\t31,370 \t\t\t5 \t%\t\t$\t29,915 \t\t\t14 \t%\t\t$\t26,251 \t\nPercentage of total net sales\t8 \t%\t\t\t\t8 \t%\t\t\t\t7 \t%\nSelling, general and administrative\t$\t26,097 \t\t\t5 \t%\t\t$\t24,932 \t\t\t(1)\t%\t\t$\t25,094 \t\nPercentage of total net sales\t7 \t%\t\t\t\t7 \t%\t\t\t\t6 \t%\nTotal operating expenses\t$\t57,467 \t\t\t5 \t%\t\t$\t54,847 \t\t\t7 \t%\t\t$\t51,345 \t\nPercentage of total net sales\t15 \t%\t\t\t\t14 \t%\t\t\t\t13 \t%\n \nResearch and Development\nThe growth in R&D expense during 2024 compared to 2023 was driven primarily by increases in headcount-related expenses.\nSelling, General and Administrative\nSelling, general and administrative expense increased $1.2 billion during 2024 compared to 2023.\nApple Inc. | 2024 Form 10-K | 24\n\nProvision for Income Taxes\nProvision for income taxes, effective tax rate and statutory federal income tax rate for 2024, 2023 and 2022 were as follows (dollars in millions):\n2024\t\t2023\t\t2022\nProvision for income taxes\t$\t29,749 \t\t\t$\t16,741 \t\t\t$\t19,300 \t\nEffective tax rate\t24.1 \t%\t\t14.7 \t%\t\t16.2 \t%\nStatutory federal income tax rate\t21 \t%\t\t21 \t%\t\t21 \t%\n \nThe Company’s effective tax rate for 2024 was higher than the statutory federal income tax rate due primarily to a one-time income tax charge of $10.2 billion, net, related to the State Aid Decision (refer to Note 7, “Income Taxes” in the Notes to Consolidated Financial Statements in Part II, Item 8 of this Form 10-K) and state income taxes, partially offset by a lower effective tax rate on foreign earnings, the impact of the U.S. federal R&D credit, and tax benefits from share-based compensation.\nThe Company’s effective tax rate for 2024 was higher compared to 2023 due primarily to a one-time income tax charge of $10.2 billion, net, related to the State Aid Decision, a higher effective tax rate on foreign earnings and lower tax benefits from share-based compensation.\nLiquidity and Capital Resources\nThe Company believes its balances of unrestricted cash, cash equivalents and marketable securities, which totaled $140.8 billion as of September 28, 2024, along with cash generated by ongoing operations and continued access to debt markets, will be sufficient to satisfy its cash requirements and capital return program over the next 12 months and beyond.\nThe Company’s material cash requirements include the following contractual obligations:\nDebt\nAs of September 28, 2024, the Company had outstanding fixed-rate notes with varying maturities for an aggregate principal amount of $97.3 billion (collectively the “Notes”), with $10.9 billion payable within 12 months. Future interest payments associated with the Notes total $38.5 billion, with $2.6 billion payable within 12 months.\nThe Company also issues unsecured short-term promissory notes pursuant to a commercial paper program. As of September 28, 2024, the Company had $10.0 billion of commercial paper outstanding, all of which was payable within 12 months.\nLeases\nThe Company has lease arrangements for certain equipment and facilities, including corporate, data center, manufacturing and retail space. As of September 28, 2024, the Company had fixed lease payment obligations of $15.6 billion, with $2.0 billion payable within 12 months.\nManufacturing Purchase Obligations\nThe Company utilizes several outsourcing partners to manufacture subassemblies for the Company’s products and to perform final assembly and testing of finished products. The Company also obtains individual components for its products from a wide variety of individual suppliers. As of September 28, 2024, the Company had manufacturing purchase obligations of $53.0 billion, with $52.9 billion payable within 12 months.\nOther Purchase Obligations\nThe Company’s other purchase obligations primarily consist of noncancelable obligations to acquire capital assets, including assets related to product manufacturing, and noncancelable obligations related to supplier arrangements, licensed intellectual property and content, and distribution rights. As of September 28, 2024, the Company had other purchase obligations of $12.0 billion, with $4.1 billion payable within 12 months.\nDeemed Repatriation Tax Payable\nAs of September 28, 2024, the balance of the deemed repatriation tax payable imposed by the U.S. Tax Cuts and Jobs Act of 2017 (the “TCJA”) was $16.5 billion, with $7.2 billion expected to be paid within 12 months.\nApple Inc. | 2024 Form 10-K | 25\n\nState Aid Decision Tax Payable\nAs of September 28, 2024, the Company had an obligation to pay €14.2 billion or $15.8 billion to Ireland in connection with the State Aid Decision, all of which was expected to be paid within 12 months. The funds necessary to settle the obligation were held in escrow as of September 28, 2024, and restricted from general use.\nCapital Return Program\nIn addition to its contractual cash requirements, the Company has an authorized share repurchase program. The program does not obligate the Company to acquire a minimum amount of shares. As of September 28, 2024, the Company’s quarterly cash dividend was $0.25 per share. The Company intends to increase its dividend on an annual basis, subject to declaration by the Board.\nIn May 2024, the Company announced a new share repurchase program of up to $110 billion and raised its quarterly dividend from $0.24 to $0.25 per share beginning in May 2024. During 2024, the Company repurchased $95.0 billion of its common stock and paid dividends and dividend equivalents of $15.2 billion.\nRecent Accounting Pronouncements\nIncome Taxes\nIn December 2023, the Financial Accounting Standards Board (the “FASB”) issued Accounting Standards Update (“ASU”) No. 2023-09, Income Taxes (Topic 740): Improvements to Income Tax Disclosures (“ASU 2023-09”), which will require the Company to disclose specified additional information in its income tax rate reconciliation and provide additional information for reconciling items that meet a quantitative threshold. ASU 2023-09 will also require the Company to disaggregate its income taxes paid disclosure by federal, state and foreign taxes, with further disaggregation required for significant individual jurisdictions. The Company will adopt ASU 2023-09 in its fourth quarter of 2026 using a prospective transition method.\nSegment Reporting\nIn November 2023, the FASB issued ASU No. 2023-07, Segment Reporting (Topic 280): Improvements to Reportable Segment Disclosures (“ASU 2023-07”), which will require the Company to disclose segment expenses that are significant and regularly provided to the Company’s chief operating decision maker (“CODM”). In addition, ASU 2023-07 will require the Company to disclose the title and position of its CODM and how the CODM uses segment profit or loss information in assessing segment performance and deciding how to allocate resources. The Company will adopt ASU 2023-07 in its fourth quarter of 2025 using a retrospective transition method.\nCritical Accounting Estimates\nThe preparation of financial statements and related disclosures in conformity with U.S. generally accepted accounting principles (“GAAP”) and the Company’s discussion and analysis of its financial condition and operating results require the Company’s management to make judgments, assumptions and estimates that affect the amounts reported. Note 1, “Summary of Significant Accounting Policies” of the Notes to Consolidated Financial Statements in Part II, Item 8 of this Form 10-K describes the significant accounting policies and methods used in the preparation of the Company’s consolidated financial statements. Management bases its estimates on historical experience and on various other assumptions it believes to be reasonable under the circumstances, the results of which form the basis for making judgments about the carrying values of assets and liabilities.\nUncertain Tax Positions\nThe Company is subject to income taxes in the U.S. and numerous foreign jurisdictions. The evaluation of the Company’s uncertain tax positions involves significant judgment in the interpretation and application of GAAP and complex domestic and international tax laws, including the TCJA and the allocation of international taxation rights between countries. Although management believes the Company’s reserves are reasonable, no assurance can be given that the final outcome of these uncertainties will not be different from that reflected in the Company’s reserves. Reserves are adjusted considering changing facts and circumstances, such as the closing of a tax examination. Resolution of these uncertainties in a manner inconsistent with management’s expectations could have a material impact on the Company’s financial condition and operating results.\nLegal and Other Contingencies\nThe Company is subject to various legal proceedings and claims that arise in the ordinary course of business, the outcomes of which are inherently uncertain. The Company records a liability when it is probable that a loss has been incurred and the amount is reasonably estimable, the determination of which requires significant judgment. Resolution of legal matters in a manner inconsistent with management’s expectations could have a material impact on the Company’s financial condition and operating results.\nApple Inc. | 2024 Form 10-K | 26\n\nItem 7A. Quantitative and Qualitative Disclosures About Market Risk\nThe Company is exposed to economic risk from interest rates and foreign exchange rates. The Company uses various strategies to manage these risks; however, they may still impact the Company’s consolidated financial statements.\nInterest Rate Risk\nThe Company is primarily exposed to fluctuations in U.S. interest rates and their impact on the Company’s investment portfolio and term debt. Increases in interest rates will negatively affect the fair value of the Company’s investment portfolio and increase the interest expense on the Company’s term debt. To protect against interest rate risk, the Company may use derivative instruments, offset interest rate–sensitive assets and liabilities, or control duration of the investment and term debt portfolios.\nThe following table sets forth potential impacts on the Company’s investment portfolio and term debt, including the effects of any associated derivatives, that would result from a hypothetical increase in relevant interest rates as of September 28, 2024 and September 30, 2023 (dollars in millions):\nInterest Rate\nSensitive Instrument\nHypothetical Interest\nRate Increase\nPotential Impact\n2024\t\t2023\nInvestment portfolio\n100 basis points, all tenors\nDecline in fair value\n$\t2,755 \t\t\t$\t3,089 \t\nTerm debt\n100 basis points, all tenors\nIncrease in annual interest expense\n$\t139 \t\t\t$\t194 \t\n \nForeign Exchange Rate Risk\nThe Company’s exposure to foreign exchange rate risk relates primarily to the Company being a net receiver of currencies other than the U.S. dollar. Changes in exchange rates, and in particular a strengthening of the U.S. dollar, will negatively affect the Company’s net sales and gross margins as expressed in U.S. dollars. Fluctuations in exchange rates may also affect the fair values of certain of the Company’s assets and liabilities. To protect against foreign exchange rate risk, the Company may use derivative instruments, offset exposures, or adjust local currency pricing of its products and services. However, the Company may choose to not hedge certain foreign currency exposures for a variety of reasons, including accounting considerations or prohibitive cost.\nThe Company applied a value-at-risk (“VAR”) model to its foreign currency derivative positions to assess the potential impact of fluctuations in exchange rates. The VAR model used a Monte Carlo simulation. The VAR is the maximum expected loss in fair value, for a given confidence interval, to the Company’s foreign currency derivative positions due to adverse movements in rates. Based on the results of the model, the Company estimates, with 95% confidence, a maximum one-day loss in fair value of $538 million and $669 million as of September 28, 2024 and September 30, 2023, respectively. Changes in the Company’s underlying foreign currency exposures, which were excluded from the assessment, generally offset changes in the fair values of the Company’s foreign currency derivatives.\nApple Inc. | 2024 Form 10-K | 27\n\nItem 8. Financial Statements and Supplementary Data\nIndex to Consolidated Financial Statements\t\tPage\nConsolidated Statements of Operations for the years ended September 28, 2024, September 30, 2023 and September 24, 2022\n29\nConsolidated Statements of Comprehensive Income for the years ended September 28, 2024, September 30, 2023 and September 24, 2022\n30\nConsolidated Balance Sheets as of September 28, 2024 and September 30, 2023\n31\nConsolidated Statements of Shareholders’ Equity for the years ended September 28, 2024, September 30, 2023 and September 24, 2022\n32\nConsolidated Statements of Cash Flows for the years ended September 28, 2024, September 30, 2023 and September 24, 2022\n33\nNotes to Consolidated Financial Statements\n34\nReports of Independent Registered Public Accounting Firm\n48\n \nAll financial statement schedules have been omitted, since the required information is not applicable or is not present in amounts sufficient to require submission of the schedule, or because the information required is included in the consolidated financial statements and accompanying notes.\nApple Inc. | 2024 Form 10-K | 28\n\nApple Inc.\nCONSOLIDATED STATEMENTS OF OPERATIONS\n(In millions, except number of shares, which are reflected in thousands, and per-share amounts)\n\nYears ended\nSeptember 28,\n2024\t\tSeptember 30,\n2023\t\tSeptember 24,\n2022\nNet sales:\t\t\t\t\t\n Products\t$\t294,866 \t\t\t$\t298,085 \t\t\t$\t316,199 \t\n Services\t96,169 \t\t\t85,200 \t\t\t78,129 \t\nTotal net sales\t391,035 \t\t\t383,285 \t\t\t394,328 \t\nCost of sales:\t\t\t\t\t\n Products\t185,233 \t\t\t189,282 \t\t\t201,471 \t\n Services\t25,119 \t\t\t24,855 \t\t\t22,075 \t\nTotal cost of sales\t210,352 \t\t\t214,137 \t\t\t223,546 \t\nGross margin\t180,683 \t\t\t169,148 \t\t\t170,782 \t\nOperating expenses:\t\t\t\t\t\nResearch and development\t31,370 \t\t\t29,915 \t\t\t26,251 \t\nSelling, general and administrative\t26,097 \t\t\t24,932 \t\t\t25,094 \t\nTotal operating expenses\t57,467 \t\t\t54,847 \t\t\t51,345 \t\nOperating income\t123,216 \t\t\t114,301 \t\t\t119,437 \t\nOther income/(expense), net\t269 \t\t\t(565)\t\t\t(334)\t\nIncome before provision for income taxes\t123,485 \t\t\t113,736 \t\t\t119,103 \t\nProvision for income taxes\t29,749 \t\t\t16,741 \t\t\t19,300 \t\nNet income\t$\t93,736 \t\t\t$\t96,995 \t\t\t$\t99,803 \t\nEarnings per share:\t\t\t\t\t\nBasic\t$\t6.11 \t\t\t$\t6.16 \t\t\t$\t6.15 \t\nDiluted\t$\t6.08 \t\t\t$\t6.13 \t\t\t$\t6.11 \t\nShares used in computing earnings per share:\t\t\t\t\t\nBasic\t15,343,783 \t\t\t15,744,231 \t\t\t16,215,963 \t\nDiluted\t15,408,095 \t\t\t15,812,547 \t\t\t16,325,819 \t\n \nSee accompanying Notes to Consolidated Financial Statements.\nApple Inc. | 2024 Form 10-K | 29\n\nApple Inc.\nCONSOLIDATED STATEMENTS OF COMPREHENSIVE INCOME\n(In millions)\n\nYears ended\nSeptember 28,\n2024\t\tSeptember 30,\n2023\t\tSeptember 24,\n2022\nNet income\t$\t93,736 \t\t\t$\t96,995 \t\t\t$\t99,803 \t\nOther comprehensive income/(loss):\t\t\t\t\t\nChange in foreign currency translation, net of tax\t395 \t\t\t(765)\t\t\t(1,511)\t\nChange in unrealized gains/losses on derivative instruments, net of tax:\t\t\t\t\t\nChange in fair value of derivative instruments\t(832)\t\t\t323 \t\t\t3,212 \t\nAdjustment for net (gains)/losses realized and included in net income\t(1,337)\t\t\t(1,717)\t\t\t(1,074)\t\nTotal change in unrealized gains/losses on derivative instruments\t(2,169)\t\t\t(1,394)\t\t\t2,138 \t\nChange in unrealized gains/losses on marketable debt securities, net of tax:\t\t\t\t\t\nChange in fair value of marketable debt securities\t5,850 \t\t\t1,563 \t\t\t(12,104)\t\nAdjustment for net (gains)/losses realized and included in net income\t204 \t\t\t253 \t\t\t205 \t\nTotal change in unrealized gains/losses on marketable debt securities\t6,054 \t\t\t1,816 \t\t\t(11,899)\t\nTotal other comprehensive income/(loss)\t4,280 \t\t\t(343)\t\t\t(11,272)\t\nTotal comprehensive income\t$\t98,016 \t\t\t$\t96,652 \t\t\t$\t88,531 \t\n \nSee accompanying Notes to Consolidated Financial Statements.\nApple Inc. | 2024 Form 10-K | 30\n\nApple Inc.\nCONSOLIDATED BALANCE SHEETS\n(In millions, except number of shares, which are reflected in thousands, and par value)\n\nSeptember 28,\n2024\t\tSeptember 30,\n2023\nASSETS:\nCurrent assets:\t\t\t\nCash and cash equivalents\t$\t29,943 \t\t\t$\t29,965 \t\nMarketable securities\t35,228 \t\t\t31,590 \t\nAccounts receivable, net\t33,410 \t\t\t29,508 \t\nVendor non-trade receivables\t32,833 \t\t\t31,477 \t\nInventories\t7,286 \t\t\t6,331 \t\nOther current assets\t14,287 \t\t\t14,695 \t\nTotal current assets\t152,987 \t\t\t143,566 \t\nNon-current assets:\t\t\t\nMarketable securities\t91,479 \t\t\t100,544 \t\nProperty, plant and equipment, net\t45,680 \t\t\t43,715 \t\nOther non-current assets\t74,834 \t\t\t64,758 \t\nTotal non-current assets\t211,993 \t\t\t209,017 \t\nTotal assets\t$\t364,980 \t\t\t$\t352,583 \t\nLIABILITIES AND SHAREHOLDERS’ EQUITY:\nCurrent liabilities:\t\t\t\nAccounts payable\t$\t68,960 \t\t\t$\t62,611 \t\nOther current liabilities\t78,304 \t\t\t58,829 \t\nDeferred revenue\t8,249 \t\t\t8,061 \t\nCommercial paper\t9,967 \t\t\t5,985 \t\nTerm debt\t10,912 \t\t\t9,822 \t\nTotal current liabilities\t176,392 \t\t\t145,308 \t\nNon-current liabilities:\t\t\t\nTerm debt\t85,750 \t\t\t95,281 \t\nOther non-current liabilities\t45,888 \t\t\t49,848 \t\nTotal non-current liabilities\t131,638 \t\t\t145,129 \t\nTotal liabilities\t308,030 \t\t\t290,437 \t\nCommitments and contingencies\t\t\t\nShareholders’ equity:\t\t\t\nCommon stock and additional paid-in capital, $0.00001 par value: 50,400,000 shares authorized; 15,116,786 and 15,550,061 shares issued and outstanding, respectively\n83,276 \t\t\t73,812 \t\nAccumulated deficit\t(19,154)\t\t\t(214)\t\nAccumulated other comprehensive loss\t(7,172)\t\t\t(11,452)\t\nTotal shareholders’ equity\t56,950 \t\t\t62,146 \t\nTotal liabilities and shareholders’ equity\t$\t364,980 \t\t\t$\t352,583 \t\n \nSee accompanying Notes to Consolidated Financial Statements.\nApple Inc. | 2024 Form 10-K | 31\n\nApple Inc.\nCONSOLIDATED STATEMENTS OF SHAREHOLDERS’ EQUITY\n(In millions, except per-share amounts)\n\nYears ended\nSeptember 28,\n2024\t\tSeptember 30,\n2023\t\tSeptember 24,\n2022\nTotal shareholders’ equity, beginning balances\t$\t62,146 \t\t\t$\t50,672 \t\t\t$\t63,090 \t\nCommon stock and additional paid-in capital:\t\t\t\t\t\nBeginning balances\t73,812 \t\t\t64,849 \t\t\t57,365 \t\nCommon stock issued\t1,423 \t\t\t1,346 \t\t\t1,175 \t\nCommon stock withheld related to net share settlement of equity awards\t(3,993)\t\t\t(3,521)\t\t\t(2,971)\t\nShare-based compensation\t12,034 \t\t\t11,138 \t\t\t9,280 \t\nEnding balances\t83,276 \t\t\t73,812 \t\t\t64,849 \t\nRetained earnings/(Accumulated deficit):\t\t\t\t\t\nBeginning balances\t(214)\t\t\t(3,068)\t\t\t5,562 \t\nNet income\t93,736 \t\t\t96,995 \t\t\t99,803 \t\nDividends and dividend equivalents declared\t(15,218)\t\t\t(14,996)\t\t\t(14,793)\t\nCommon stock withheld related to net share settlement of equity awards\t(1,612)\t\t\t(2,099)\t\t\t(3,454)\t\nCommon stock repurchased\t(95,846)\t\t\t(77,046)\t\t\t(90,186)\t\nEnding balances\t(19,154)\t\t\t(214)\t\t\t(3,068)\t\nAccumulated other comprehensive income/(loss):\t\t\t\t\t\nBeginning balances\t(11,452)\t\t\t(11,109)\t\t\t163 \t\nOther comprehensive income/(loss)\t4,280 \t\t\t(343)\t\t\t(11,272)\t\nEnding balances\t(7,172)\t\t\t(11,452)\t\t\t(11,109)\t\nTotal shareholders’ equity, ending balances\t$\t56,950 \t\t\t$\t62,146 \t\t\t$\t50,672 \t\nDividends and dividend equivalents declared per share or RSU\t$\t0.98 \t\t\t$\t0.94 \t\t\t$\t0.90 \t\n \nSee accompanying Notes to Consolidated Financial Statements.\nApple Inc. | 2024 Form 10-K | 32\n\nApple Inc.\nCONSOLIDATED STATEMENTS OF CASH FLOWS\n(In millions)\nYears ended\nSeptember 28,\n2024\t\tSeptember 30,\n2023\t\tSeptember 24,\n2022\nCash, cash equivalents, and restricted cash and cash equivalents, beginning balances\n$\t30,737 \t\t\t$\t24,977 \t\t\t$\t35,929 \t\nOperating activities:\t\t\t\t\t\nNet income\t93,736 \t\t\t96,995 \t\t\t99,803 \t\nAdjustments to reconcile net income to cash generated by operating activities:\t\t\t\t\t\nDepreciation and amortization\t11,445 \t\t\t11,519 \t\t\t11,104 \t\nShare-based compensation expense\t11,688 \t\t\t10,833 \t\t\t9,038 \t\nOther\t(2,266)\t\t\t(2,227)\t\t\t1,006 \t\nChanges in operating assets and liabilities:\t\t\t\t\t\nAccounts receivable, net\t(3,788)\t\t\t(1,688)\t\t\t(1,823)\t\nVendor non-trade receivables\t(1,356)\t\t\t1,271 \t\t\t(7,520)\t\nInventories\t(1,046)\t\t\t(1,618)\t\t\t1,484 \t\nOther current and non-current assets\t(11,731)\t\t\t(5,684)\t\t\t(6,499)\t\nAccounts payable\t6,020 \t\t\t(1,889)\t\t\t9,448 \t\nOther current and non-current liabilities\t15,552 \t\t\t3,031 \t\t\t6,110 \t\nCash generated by operating activities\t118,254 \t\t\t110,543 \t\t\t122,151 \t\nInvesting activities:\t\t\t\t\t\nPurchases of marketable securities\t(48,656)\t\t\t(29,513)\t\t\t(76,923)\t\nProceeds from maturities of marketable securities\t51,211 \t\t\t39,686 \t\t\t29,917 \t\nProceeds from sales of marketable securities\t11,135 \t\t\t5,828 \t\t\t37,446 \t\nPayments for acquisition of property, plant and equipment\t(9,447)\t\t\t(10,959)\t\t\t(10,708)\t\nOther\t(1,308)\t\t\t(1,337)\t\t\t(2,086)\t\nCash generated by/(used in) investing activities\t2,935 \t\t\t3,705 \t\t\t(22,354)\t\nFinancing activities:\t\t\t\t\t\nPayments for taxes related to net share settlement of equity awards\t(5,441)\t\t\t(5,431)\t\t\t(6,223)\t\nPayments for dividends and dividend equivalents\t(15,234)\t\t\t(15,025)\t\t\t(14,841)\t\nRepurchases of common stock\t(94,949)\t\t\t(77,550)\t\t\t(89,402)\t\nProceeds from issuance of term debt, net\t— \t\t\t5,228 \t\t\t5,465 \t\nRepayments of term debt\t(9,958)\t\t\t(11,151)\t\t\t(9,543)\t\nProceeds from/(Repayments of) commercial paper, net\t3,960 \t\t\t(3,978)\t\t\t3,955 \t\nOther\t(361)\t\t\t(581)\t\t\t(160)\t\nCash used in financing activities\t(121,983)\t\t\t(108,488)\t\t\t(110,749)\t\nIncrease/(Decrease) in cash, cash equivalents, and restricted cash and cash equivalents\t(794)\t\t\t5,760 \t\t\t(10,952)\t\nCash, cash equivalents, and restricted cash and cash equivalents, ending balances\n$\t29,943 \t\t\t$\t30,737 \t\t\t$\t24,977 \t\nSupplemental cash flow disclosure:\t\t\t\t\t\nCash paid for income taxes, net\t$\t26,102 \t\t\t$\t18,679 \t\t\t$\t19,573 \t\n \nSee accompanying Notes to Consolidated Financial Statements.\nApple Inc. | 2024 Form 10-K | 33\n\nApple Inc.\nNotes to Consolidated Financial Statements\nNote 1 – Summary of Significant Accounting Policies\nBasis of Presentation and Preparation\nThe consolidated financial statements include the accounts of Apple Inc. and its wholly owned subsidiaries. The preparation of these consolidated financial statements and accompanying notes in conformity with GAAP requires the use of management estimates. Certain prior period amounts in the consolidated financial statements and accompanying notes have been reclassified to conform to the current period’s presentation.\nThe Company’s fiscal year is the 52- or 53-week period that ends on the last Saturday of September. An additional week is included in the first fiscal quarter every five or six years to realign the Company’s fiscal quarters with calendar quarters, which occurred in the first fiscal quarter of 2023. The Company’s fiscal years 2024 and 2022 spanned 52 weeks each, whereas fiscal year 2023 spanned 53 weeks. Unless otherwise stated, references to particular years, quarters, months and periods refer to the Company’s fiscal years ended in September and the associated quarters, months and periods of those fiscal years.\nRevenue\nThe Company records revenue net of taxes collected from customers that are remitted to governmental authorities.\nShare-Based Compensation\nThe Company recognizes share-based compensation expense on a straight-line basis for its estimate of equity awards that will ultimately vest.\nCash Equivalents\nAll highly liquid investments with maturities of three months or less at the date of purchase are treated as cash equivalents.\nMarketable Securities\nThe cost of securities sold is determined using the specific identification method.\nInventories\nInventories are measured using the first-in, first-out method.\nProperty, Plant and Equipment\nDepreciation on property, plant and equipment is recognized on a straight-line basis.\nDerivative Instruments\nThe Company presents derivative assets and liabilities at their gross fair values in the Consolidated Balance Sheets.\nIncome Taxes\nThe Company records certain deferred tax assets and liabilities in connection with the minimum tax on certain foreign earnings created by the TCJA.\nLeases\nThe Company combines and accounts for lease and nonlease components as a single lease component for leases of corporate, data center and retail facilities.\nApple Inc. | 2024 Form 10-K | 34\n\nNote 2 – Revenue\nThe Company recognizes revenue at the amount to which it expects to be entitled when control of the products or services is transferred to its customers. Control is generally transferred when the Company has a present right to payment and title and the significant risks and rewards of ownership of products or services are transferred to its customers. For most of the Company’s Products net sales, control transfers when products are shipped. For the Company’s Services net sales, control transfers over time as services are delivered. Payment for Products and Services net sales is collected within a short period following transfer of control or commencement of delivery of services, as applicable.\nThe Company records reductions to Products net sales related to future product returns, price protection and other customer incentive programs based on the Company’s expectations and historical experience.\nFor arrangements with multiple performance obligations, which represent promises within an arrangement that are distinct, the Company allocates revenue to all distinct performance obligations based on their relative stand-alone selling prices (“SSPs”). When available, the Company uses observable prices to determine SSPs. When observable prices are not available, SSPs are established that reflect the Company’s best estimates of what the selling prices of the performance obligations would be if they were sold regularly on a stand-alone basis. The Company’s process for estimating SSPs without observable prices considers multiple factors that may vary depending upon the unique facts and circumstances related to each performance obligation including, where applicable, prices charged by the Company for similar offerings, market trends in the pricing for similar offerings, product-specific business objectives and the estimated cost to provide the performance obligation.\nThe Company has identified the performance obligations regularly included in arrangements involving the sale of iPhone, Mac and iPad. The first material performance obligation, which represents the substantial portion of the allocated sales price, is the hardware and bundled software delivered at the time of sale. The second material performance obligation is the right to receive certain product-related bundled services, which include iCloud®, Siri® and Maps. The Company allocates revenue and any related discounts to all of its performance obligations based on their relative SSPs. Because the Company lacks observable prices for product-related bundled services, the allocation of revenue is based on the Company’s estimated SSPs. Revenue allocated to the delivered hardware and bundled software is recognized when control has transferred to the customer, which generally occurs when the product is shipped. Revenue allocated to product-related bundled services is deferred and recognized on a straight-line basis over the estimated period they are expected to be provided.\nFor certain long-term service arrangements, the Company has performance obligations for services it has not yet delivered. For these arrangements, the Company does not have a right to bill for the undelivered services. The Company has determined that any unbilled consideration relates entirely to the value of the undelivered services. Accordingly, the Company has not recognized revenue, and does not disclose amounts, related to these undelivered services.\nFor the sale of third-party products where the Company obtains control of the product before transferring it to the customer, the Company recognizes revenue based on the gross amount billed to customers. The Company considers multiple factors when determining whether it obtains control of third-party products, including evaluating if it can establish the price of the product, retains inventory risk for tangible products or has the responsibility for ensuring acceptability of the product. For third-party applications sold through the App Store, the Company does not obtain control of the product before transferring it to the customer. Therefore, the Company accounts for all third-party application–related sales on a net basis by recognizing in Services net sales only the commission it retains.\nNet sales disaggregated by significant products and services for 2024, 2023 and 2022 were as follows (in millions):\n2024\t\t2023\t\t2022\niPhone\n$\t201,183 \t\t\t$\t200,583 \t\t\t$\t205,489 \t\nMac\n29,984 \t\t\t29,357 \t\t\t40,177 \t\niPad\n26,694 \t\t\t28,300 \t\t\t29,292 \t\nWearables, Home and Accessories\n37,005 \t\t\t39,845 \t\t\t41,241 \t\nServices (1)\n96,169 \t\t\t85,200 \t\t\t78,129 \t\nTotal net sales\t$\t391,035 \t\t\t$\t383,285 \t\t\t$\t394,328 \t\n \n(1)Services net sales include amortization of the deferred value of services bundled in the sales price of certain products.\nTotal net sales include $7.7 billion of revenue recognized in 2024 that was included in deferred revenue as of September 30, 2023, $8.2 billion of revenue recognized in 2023 that was included in deferred revenue as of September 24, 2022, and $7.5 billion of revenue recognized in 2022 that was included in deferred revenue as of September 25, 2021.\nApple Inc. | 2024 Form 10-K | 35\n\nThe Company’s proportion of net sales by disaggregated revenue source was generally consistent for each reportable segment in Note 13, “Segment Information and Geographic Data” for 2024, 2023 and 2022, except in Greater China, where iPhone revenue represented a moderately higher proportion of net sales.\nAs of September 28, 2024 and September 30, 2023, the Company had total deferred revenue of $12.8 billion and $12.1 billion, respectively. As of September 28, 2024, the Company expects 64% of total deferred revenue to be realized in less than a year, 25% within one-to-two years, 9% within two-to-three years and 2% in greater than three years.\nNote 3 – Earnings Per Share\nThe following table shows the computation of basic and diluted earnings per share for 2024, 2023 and 2022 (net income in millions and shares in thousands):\n2024\t\t2023\t\t2022\nNumerator:\t\t\t\t\t\nNet income\t$\t93,736 \t\t\t$\t96,995 \t\t\t$\t99,803 \t\nDenominator:\t\t\t\t\t\nWeighted-average basic shares outstanding\t15,343,783 \t\t\t15,744,231 \t\t\t16,215,963 \t\nEffect of dilutive share-based awards\t64,312 \t\t\t68,316 \t\t\t109,856 \t\nWeighted-average diluted shares\t15,408,095 \t\t\t15,812,547 \t\t\t16,325,819 \t\nBasic earnings per share\t$\t6.11 \t\t\t$\t6.16 \t\t\t$\t6.15 \t\nDiluted earnings per share\t$\t6.08 \t\t\t$\t6.13 \t\t\t$\t6.11 \t\n \nApproximately 24 million restricted stock units (“RSUs”) were excluded from the computation of diluted earnings per share for 2023 because their effect would have been antidilutive.\nNote 4 – Financial Instruments\nCash, Cash Equivalents and Marketable Securities\nThe following tables show the Company’s cash, cash equivalents and marketable securities by significant investment category as of September 28, 2024 and September 30, 2023 (in millions):\n2024\nAdjusted\nCost\t\tUnrealized\nGains\t\tUnrealized\nLosses\t\tFair\nValue\t\tCash and\nCash\nEquivalents\t\tCurrent\nMarketable\nSecurities\t\tNon-Current\nMarketable\nSecurities\nCash\t$\t27,199 \t\t\t$\t— \t\t\t$\t— \t\t\t$\t27,199 \t\t\t$\t27,199 \t\t\t$\t— \t\t\t$\t— \t\nLevel 1:\t\t\t\t\t\t\t\t\t\t\t\t\t\nMoney market funds\t778 \t\t\t— \t\t\t— \t\t\t778 \t\t\t778 \t\t\t— \t\t\t— \t\nMutual funds\n515 \t\t\t105 \t\t\t(3)\t\t\t617 \t\t\t— \t\t\t617 \t\t\t— \t\nSubtotal\t1,293 \t\t\t105 \t\t\t(3)\t\t\t1,395 \t\t\t778 \t\t\t617 \t\t\t— \t\nLevel 2 (1):\nU.S. Treasury securities\t16,150 \t\t\t45 \t\t\t(516)\t\t\t15,679 \t\t\t212 \t\t\t4,087 \t\t\t11,380 \t\nU.S. agency securities\t5,431 \t\t\t— \t\t\t(272)\t\t\t5,159 \t\t\t155 \t\t\t703 \t\t\t4,301 \t\nNon-U.S. government securities\t17,959 \t\t\t93 \t\t\t(484)\t\t\t17,568 \t\t\t1,158 \t\t\t10,810 \t\t\t5,600 \t\nCertificates of deposit and time deposits\t873 \t\t\t— \t\t\t— \t\t\t873 \t\t\t387 \t\t\t478 \t\t\t8 \t\nCommercial paper\t1,066 \t\t\t— \t\t\t— \t\t\t1,066 \t\t\t28 \t\t\t1,038 \t\t\t— \t\nCorporate debt securities\t65,622 \t\t\t270 \t\t\t(1,953)\t\t\t63,939 \t\t\t26 \t\t\t16,027 \t\t\t47,886 \t\nMunicipal securities\t412 \t\t\t— \t\t\t(7)\t\t\t405 \t\t\t— \t\t\t190 \t\t\t215 \t\nMortgage- and asset-backed securities\t24,595 \t\t\t175 \t\t\t(1,403)\t\t\t23,367 \t\t\t— \t\t\t1,278 \t\t\t22,089 \t\nSubtotal\t132,108 \t\t\t583 \t\t\t(4,635)\t\t\t128,056 \t\t\t1,966 \t\t\t34,611 \t\t\t91,479 \t\nTotal (2)(3)\n$\t160,600 \t\t\t$\t688 \t\t\t$\t(4,638)\t\t\t$\t156,650 \t\t\t$\t29,943 \t\t\t$\t35,228 \t\t\t$\t91,479 \t\n \nApple Inc. | 2024 Form 10-K | 36\n\n2023\nAdjusted\nCost\t\tUnrealized\nGains\t\tUnrealized\nLosses\t\tFair\nValue\t\tCash and\nCash\nEquivalents\t\tCurrent\nMarketable\nSecurities\t\tNon-Current\nMarketable\nSecurities\nCash\t$\t28,359 \t\t\t$\t— \t\t\t$\t— \t\t\t$\t28,359 \t\t\t$\t28,359 \t\t\t$\t— \t\t\t$\t— \t\nLevel 1:\t\t\t\t\t\t\t\t\t\t\t\t\t\nMoney market funds\t481 \t\t\t— \t\t\t— \t\t\t481 \t\t\t481 \t\t\t— \t\t\t— \t\nMutual funds and equity securities\n442 \t\t\t12 \t\t\t(26)\t\t\t428 \t\t\t— \t\t\t428 \t\t\t— \t\nSubtotal\t923 \t\t\t12 \t\t\t(26)\t\t\t909 \t\t\t481 \t\t\t428 \t\t\t— \t\nLevel 2 (1):\nU.S. Treasury securities\t19,406 \t\t\t— \t\t\t(1,292)\t\t\t18,114 \t\t\t35 \t\t\t5,468 \t\t\t12,611 \t\nU.S. agency securities\t5,736 \t\t\t— \t\t\t(600)\t\t\t5,136 \t\t\t36 \t\t\t271 \t\t\t4,829 \t\nNon-U.S. government securities\t17,533 \t\t\t6 \t\t\t(1,048)\t\t\t16,491 \t\t\t— \t\t\t11,332 \t\t\t5,159 \t\nCertificates of deposit and time deposits\t1,354 \t\t\t— \t\t\t— \t\t\t1,354 \t\t\t1,034 \t\t\t320 \t\t\t— \t\nCommercial paper\t608 \t\t\t— \t\t\t— \t\t\t608 \t\t\t— \t\t\t608 \t\t\t— \t\nCorporate debt securities\t76,840 \t\t\t6 \t\t\t(5,956)\t\t\t70,890 \t\t\t20 \t\t\t12,627 \t\t\t58,243 \t\nMunicipal securities\t628 \t\t\t— \t\t\t(26)\t\t\t602 \t\t\t— \t\t\t192 \t\t\t410 \t\nMortgage- and asset-backed securities\t22,365 \t\t\t6 \t\t\t(2,735)\t\t\t19,636 \t\t\t— \t\t\t344 \t\t\t19,292 \t\nSubtotal\t144,470 \t\t\t18 \t\t\t(11,657)\t\t\t132,831 \t\t\t1,125 \t\t\t31,162 \t\t\t100,544 \t\nTotal (3)\n$\t173,752 \t\t\t$\t30 \t\t\t$\t(11,683)\t\t\t$\t162,099 \t\t\t$\t29,965 \t\t\t$\t31,590 \t\t\t$\t100,544 \t\n \n(1)The valuation techniques used to measure the fair values of the Company’s Level 2 financial instruments, which generally have counterparties with high credit ratings, are based on quoted market prices or model-driven valuations using significant inputs derived from or corroborated by observable market data.\n(2)As of September 28, 2024, cash and cash equivalents included $2.6 billion held in escrow and restricted from general use. These restricted cash and cash equivalents were designated to settle the Company’s obligation related to the State Aid Decision (refer to Note 7, “Income Taxes”).\n(3)As of September 28, 2024 and September 30, 2023, total marketable securities included $13.2 billion and $13.8 billion, respectively, held in escrow and restricted from general use. The September 28, 2024 restricted marketable securities were designated to settle the Company’s obligation related to the State Aid Decision (refer to Note 7, “Income Taxes”).\nAs of September 28, 2024, 86% of the Company’s non-current marketable debt securities other than mortgage- and asset-backed securities had maturities between 1 and 5 years, 10% between 5 and 10 years, and 4% greater than 10 years. As of September 28, 2024, 14% of the Company’s non-current mortgage- and asset-backed securities had maturities between 1 and 5 years, 9% between 5 and 10 years, and 77% greater than 10 years.\nThe Company’s investments in marketable debt securities have been classified and accounted for as available-for-sale. The Company classifies marketable debt securities as either current or non-current based on each instrument’s underlying maturity.\nDerivative Instruments and Hedging\nThe Company may use derivative instruments to partially offset its business exposure to foreign exchange and interest rate risk. However, the Company may choose not to hedge certain exposures for a variety of reasons including accounting considerations or the prohibitive economic cost of hedging particular exposures. There can be no assurance the hedges will offset more than a portion of the financial impact resulting from movements in foreign exchange or interest rates.\nThe Company classifies cash flows related to derivative instruments in the same section of the Consolidated Statements of Cash Flows as the items being hedged, which are generally classified as operating activities.\nForeign Exchange Rate Risk\nTo protect gross margins from fluctuations in foreign exchange rates, the Company may use forwards, options or other instruments, and may designate these instruments as cash flow hedges. The Company generally hedges portions of its forecasted foreign currency exposure associated with revenue and inventory purchases, typically for up to 12 months.\nApple Inc. | 2024 Form 10-K | 37\n\nTo protect the Company’s foreign currency–denominated term debt or marketable securities from fluctuations in foreign exchange rates, the Company may use forwards, cross-currency swaps or other instruments. The Company designates these instruments as either cash flow or fair value hedges. As of September 28, 2024, the maximum length of time over which the Company is hedging its exposure to the variability in future cash flows for term debt–related foreign currency transactions is 18 years.\nThe Company may also use derivative instruments that are not designated as accounting hedges to protect gross margins from certain fluctuations in foreign exchange rates, as well as to offset a portion of the foreign currency gains and losses generated by the remeasurement of certain assets and liabilities denominated in non-functional currencies.\nInterest Rate Risk\nTo protect the Company’s term debt or marketable securities from fluctuations in interest rates, the Company may use interest rate swaps, options or other instruments. The Company designates these instruments as either cash flow or fair value hedges.\nThe notional amounts of the Company’s outstanding derivative instruments as of September 28, 2024 and September 30, 2023 were as follows (in millions):\n2024\t\t2023\nDerivative instruments designated as accounting hedges:\t\t\t\nForeign exchange contracts\t$\t64,069 \t\t\t$\t74,730 \t\nInterest rate contracts\t$\t14,575 \t\t\t$\t19,375 \t\nDerivative instruments not designated as accounting hedges:\t\t\t\nForeign exchange contracts\t$\t91,493 \t\t\t$\t104,777 \t\n \nThe carrying amounts of the Company’s hedged items in fair value hedges as of September 28, 2024 and September 30, 2023 were as follows (in millions):\n2024\t\t2023\nHedged assets/(liabilities):\t\t\t\nCurrent and non-current marketable securities\t$\t— \t\t\t$\t14,433 \t\nCurrent and non-current term debt\t$\t(13,505)\t\t\t$\t(18,247)\t\n \nAccounts Receivable\nTrade Receivables\nThe Company’s third-party cellular network carriers accounted for 38% and 41% of total trade receivables as of September 28, 2024 and September 30, 2023, respectively. The Company requires third-party credit support or collateral from certain customers to limit credit risk.\nVendor Non-Trade Receivables\nThe Company has non-trade receivables from certain of its manufacturing vendors resulting from the sale of components to these vendors who manufacture subassemblies or assemble final products for the Company. The Company purchases these components directly from suppliers. The Company does not reflect the sale of these components in products net sales. Rather, the Company recognizes any gain on these sales as a reduction of products cost of sales when the related final products are sold by the Company. As of September 28, 2024, the Company had two vendors that individually represented 10% or more of total vendor non-trade receivables, which accounted for 44% and 23%. As of September 30, 2023, the Company had two vendors that individually represented 10% or more of total vendor non-trade receivables, which accounted for 48% and 23%.\nApple Inc. | 2024 Form 10-K | 38\n\nNote 5 – Property, Plant and Equipment\nThe following table shows the Company’s gross property, plant and equipment by major asset class and accumulated depreciation as of September 28, 2024 and September 30, 2023 (in millions):\n2024\t\t2023\nLand and buildings\t$\t24,690 \t\t\t$\t23,446 \t\nMachinery, equipment and internal-use software\t80,205 \t\t\t78,314 \t\nLeasehold improvements\t14,233 \t\t\t12,839 \t\nGross property, plant and equipment\t119,128 \t\t\t114,599 \t\nAccumulated depreciation\n(73,448)\t\t\t(70,884)\t\nTotal property, plant and equipment, net\t$\t45,680 \t\t\t$\t43,715 \t\n \nDepreciation expense on property, plant and equipment was $8.2 billion, $8.5 billion and $8.7 billion during 2024, 2023 and 2022, respectively.\nNote 6 – Consolidated Financial Statement Details\nThe following tables show the Company’s consolidated financial statement details as of September 28, 2024 and September 30, 2023 (in millions):\nOther Non-Current Assets\n2024\t\t2023\nDeferred tax assets\t$\t19,499 \t\t\t$\t17,852 \t\nOther non-current assets\t55,335 \t\t\t46,906 \t\nTotal other non-current assets\t$\t74,834 \t\t\t$\t64,758 \t\n \nOther Current Liabilities\n2024\t\t2023\nIncome taxes payable\t$\t26,601 \t\t\t$\t8,819 \t\nOther current liabilities\t51,703 \t\t\t50,010 \t\nTotal other current liabilities\t$\t78,304 \t\t\t$\t58,829 \t\n \nOther Non-Current Liabilities\n2024\t\t2023\nIncome taxes payable\n$\t9,254 \t\t\t$\t15,457 \t\nOther non-current liabilities\t36,634 \t\t\t34,391 \t\nTotal other non-current liabilities\t$\t45,888 \t\t\t$\t49,848 \t\n \nNote 7 – Income Taxes\nEuropean Commission State Aid Decision\nOn August 30, 2016, the Commission announced its decision that Ireland granted state aid to the Company by providing tax opinions in 1991 and 2007 concerning the tax allocation of profits of the Irish branches of two subsidiaries of the Company (the “State Aid Decision”). The State Aid Decision ordered Ireland to calculate and recover additional taxes from the Company for the period June 2003 through December 2014. Irish legislative changes, effective as of January 2015, eliminated the application of the tax opinions from that date forward. The recovery amount was calculated to be €13.1 billion, plus interest of €1.2 billion.\nFrom time to time, the Company requested approval from the Irish Minister for Finance to reduce the recovery amount for certain taxes paid to other countries. As of September 28, 2024, the adjusted recovery amount of €12.7 billion plus interest of €1.2 billion was held in escrow and restricted from general use. The total balance of the escrow, including net unrealized investment gains, was €14.2 billion or $15.8 billion as of September 28, 2024, of which $2.6 billion was classified as cash and cash equivalents and $13.2 billion was classified as current marketable securities in the Consolidated Balance Sheet. Refer to the Cash, Cash Equivalents and Marketable Securities section of Note 4, “Financial Instruments” for more information.\nApple Inc. | 2024 Form 10-K | 39\n\nThe Company and Ireland appealed the State Aid Decision to the General Court of the Court of Justice of the European Union (the “General Court”). On July 15, 2020, the General Court annulled the State Aid Decision. On September 25, 2020, the Commission appealed the General Court’s decision to the European Court of Justice (the “ECJ”) and a hearing was held on May 23, 2023. On September 10, 2024, the ECJ announced that it had set aside the 2020 judgment of the General Court and confirmed the Commission’s 2016 State Aid Decision. As a result, during the fourth quarter of 2024 the Company recorded a one-time income tax charge of $10.2 billion, net, which represents $15.8 billion payable to Ireland via release of the escrow, partially offset by a U.S. foreign tax credit of $4.8 billion and a decrease in unrecognized tax benefits of $823 million.\nProvision for Income Taxes and Effective Tax Rate\nThe provision for income taxes for 2024, 2023 and 2022, consisted of the following (in millions):\n2024\t\t2023\t\t2022\nFederal:\t\t\t\t\t\nCurrent\t$\t5,571 \t\t\t$\t9,445 \t\t\t$\t7,890 \t\nDeferred\t(3,080)\t\t\t(3,644)\t\t\t(2,265)\t\nTotal\t2,491 \t\t\t5,801 \t\t\t5,625 \t\nState:\t\t\t\t\t\nCurrent\t1,726 \t\t\t1,570 \t\t\t1,519 \t\nDeferred\t(298)\t\t\t(49)\t\t\t84 \t\nTotal\t1,428 \t\t\t1,521 \t\t\t1,603 \t\nForeign:\t\t\t\t\t\nCurrent\t25,483 \t\t\t8,750 \t\t\t8,996 \t\nDeferred\t347 \t\t\t669 \t\t\t3,076 \t\nTotal\t25,830 \t\t\t9,419 \t\t\t12,072 \t\nProvision for income taxes\t$\t29,749 \t\t\t$\t16,741 \t\t\t$\t19,300 \t\n \nForeign pretax earnings were $77.3 billion, $72.9 billion and $71.3 billion in 2024, 2023 and 2022, respectively.\nA reconciliation of the provision for income taxes to the amount computed by applying the statutory federal income tax rate (21% in 2024, 2023 and 2022) to income before provision for income taxes for 2024, 2023 and 2022, is as follows (dollars in millions):\n2024\t\t2023\t\t2022\nComputed expected tax\t$\t25,932 \t\t\t$\t23,885 \t\t\t$\t25,012 \t\nState taxes, net of federal effect\t1,162 \t\t\t1,124 \t\t\t1,518 \t\nImpact of the State Aid Decision\n10,246 \t\t\t— \t\t\t— \t\nEarnings of foreign subsidiaries\t(5,311)\t\t\t(5,744)\t\t\t(4,366)\t\nResearch and development credit, net\t(1,397)\t\t\t(1,212)\t\t\t(1,153)\t\nExcess tax benefits from equity awards\t(893)\t\t\t(1,120)\t\t\t(1,871)\t\nOther\t10 \t\t\t(192)\t\t\t160 \t\nProvision for income taxes\t$\t29,749 \t\t\t$\t16,741 \t\t\t$\t19,300 \t\nEffective tax rate\t24.1 \t%\t\t14.7 \t%\t\t16.2 \t%\n \nApple Inc. | 2024 Form 10-K | 40\n\nDeferred Tax Assets and Liabilities\nAs of September 28, 2024 and September 30, 2023, the significant components of the Company’s deferred tax assets and liabilities were (in millions):\n2024\t\t2023\nDeferred tax assets:\t\t\t\nCapitalized research and development\t$\t10,739 \t\t\t$\t6,294 \t\nTax credit carryforwards\t8,856 \t\t\t8,302 \t\nAccrued liabilities and other reserves\t6,114 \t\t\t6,365 \t\nDeferred revenue\t3,413 \t\t\t4,571 \t\nLease liabilities\t2,410 \t\t\t2,421 \t\nUnrealized losses\t1,173 \t\t\t2,447 \t\nOther\t2,168 \t\t\t2,343 \t\nTotal deferred tax assets\t34,873 \t\t\t32,743 \t\nLess: Valuation allowance\t(8,866)\t\t\t(8,374)\t\nTotal deferred tax assets, net\t26,007 \t\t\t24,369 \t\nDeferred tax liabilities:\t\t\t\nDepreciation\t2,551 \t\t\t1,998 \t\nRight-of-use assets\t2,125 \t\t\t2,179 \t\nMinimum tax on foreign earnings\t1,674 \t\t\t1,940 \t\nUnrealized gains\t— \t\t\t511 \t\nOther\t455 \t\t\t490 \t\nTotal deferred tax liabilities\t6,805 \t\t\t7,118 \t\nNet deferred tax assets\t$\t19,202 \t\t\t$\t17,251 \t\n \nAs of September 28, 2024, the Company had $5.1 billion in foreign tax credit carryforwards in Ireland and $3.6 billion in California R&D credit carryforwards, both of which can be carried forward indefinitely. A valuation allowance has been recorded for the credit carryforwards and a portion of other temporary differences.\nUncertain Tax Positions\nAs of September 28, 2024, the total amount of gross unrecognized tax benefits was $22.0 billion, of which $10.8 billion, if recognized, would impact the Company’s effective tax rate. As of September 30, 2023, the total amount of gross unrecognized tax benefits was $19.5 billion, of which $9.5 billion, if recognized, would have impacted the Company’s effective tax rate.\nThe aggregate change in the balance of gross unrecognized tax benefits, which excludes interest and penalties, for 2024, 2023 and 2022, is as follows (in millions):\n2024\t\t2023\t\t2022\nBeginning balances\t$\t19,454 \t\t\t$\t16,758 \t\t\t$\t15,477 \t\nIncreases related to tax positions taken during a prior year\t1,727 \t\t\t2,044 \t\t\t2,284 \t\nDecreases related to tax positions taken during a prior year\t(386)\t\t\t(1,463)\t\t\t(1,982)\t\nIncreases related to tax positions taken during the current year\t2,542 \t\t\t2,628 \t\t\t1,936 \t\nDecreases related to settlements with taxing authorities\t(1,070)\t\t\t(19)\t\t\t(28)\t\nDecreases related to expiration of the statute of limitations\t(229)\t\t\t(494)\t\t\t(929)\t\nEnding balances\t$\t22,038 \t\t\t$\t19,454 \t\t\t$\t16,758 \t\n \nThe Company is subject to taxation and files income tax returns in the U.S. federal jurisdiction and many state and foreign jurisdictions. Tax years after 2017 for the U.S. federal jurisdiction, and after 2014 in certain major foreign jurisdictions, remain subject to examination. Although the timing of resolution or closure of examinations is not certain, the Company believes it is reasonably possible that its gross unrecognized tax benefits could decrease between approximately $5 billion and $13 billion in the next 12 months, primarily related to intercompany transfer pricing and deemed repatriation tax.\nApple Inc. | 2024 Form 10-K | 41\n\nNote 8 – Leases\nThe Company has lease arrangements for certain equipment and facilities, including corporate, data center, manufacturing and retail space. These leases typically have original terms not exceeding 10 years and generally contain multiyear renewal options, some of which are reasonably certain of exercise.\nPayments under the Company’s lease arrangements may be fixed or variable, and variable lease payments are primarily based on purchases of output of the underlying leased assets. Lease costs associated with fixed payments on the Company’s operating leases were $2.0 billion for both 2024 and 2023 and $1.9 billion for 2022. Lease costs associated with variable payments on the Company’s leases were $13.8 billion, $13.9 billion and $14.9 billion for 2024, 2023 and 2022, respectively.\nThe Company made fixed cash payments related to operating leases of $1.9 billion in both 2024 and 2023 and $1.8 billion in 2022. Noncash activities involving right-of-use (“ROU”) assets obtained in exchange for lease liabilities were $1.0 billion, $2.1 billion and $2.8 billion for 2024, 2023 and 2022, respectively.\nThe following table shows ROU assets and lease liabilities, and the associated financial statement line items, as of September 28, 2024 and September 30, 2023 (in millions):\nLease-Related Assets and Liabilities\t\tFinancial Statement Line Items\t\t2024\t\t2023\nRight-of-use assets:\t\t\t\t\t\t\nOperating leases\t\tOther non-current assets\t\t$\t10,234 \t\t\t$\t10,661 \t\nFinance leases\t\tProperty, plant and equipment, net\t\t1,069 \t\t\t1,015 \t\nTotal right-of-use assets\t\t\t\t$\t11,303 \t\t\t$\t11,676 \t\nLease liabilities:\t\t\t\t\t\t\nOperating leases\t\tOther current liabilities\t\t$\t1,488 \t\t\t$\t1,410 \t\nOther non-current liabilities\t\t10,046 \t\t\t10,408 \t\nFinance leases\t\tOther current liabilities\t\t144 \t\t\t165 \t\nOther non-current liabilities\t\t752 \t\t\t859 \t\nTotal lease liabilities\t\t\t\t$\t12,430 \t\t\t$\t12,842 \t\n \nLease liability maturities as of September 28, 2024, are as follows (in millions):\nOperating\nLeases\t\tFinance\nLeases\t\tTotal\n2025\t$\t1,820 \t\t\t$\t171 \t\t\t$\t1,991 \t\n2026\t1,914 \t\t\t131 \t\t\t2,045 \t\n2027\t1,674 \t\t\t59 \t\t\t1,733 \t\n2028\t1,360 \t\t\t38 \t\t\t1,398 \t\n2029\t1,187 \t\t\t36 \t\t\t1,223 \t\nThereafter\t5,563 \t\t\t837 \t\t\t6,400 \t\nTotal undiscounted liabilities\t13,518 \t\t\t1,272 \t\t\t14,790 \t\nLess: Imputed interest\t(1,984)\t\t\t(376)\t\t\t(2,360)\t\nTotal lease liabilities\t$\t11,534 \t\t\t$\t896 \t\t\t$\t12,430 \t\n \nThe weighted-average remaining lease term related to the Company’s lease liabilities as of September 28, 2024 and September 30, 2023 was 10.3 years and 10.6 years, respectively. The discount rate related to the Company’s lease liabilities as of September 28, 2024 and September 30, 2023 was 3.1% and 3.0%, respectively. The discount rates related to the Company’s lease liabilities are generally based on estimates of the Company’s incremental borrowing rate, as the discount rates implicit in the Company’s leases cannot be readily determined.\nAs of September 28, 2024, the Company had $849 million of fixed payment obligations under additional leases, primarily for corporate facilities and retail space, that had not yet commenced. These leases will commence between 2025 and 2026, with lease terms ranging from less than 1 year to 21 years.\nApple Inc. | 2024 Form 10-K | 42\n\nNote 9 – Debt\nCommercial Paper\nThe Company issues unsecured short-term promissory notes pursuant to a commercial paper program. The Company uses net proceeds from the commercial paper program for general corporate purposes, including dividends and share repurchases. As of September 28, 2024 and September 30, 2023, the Company had $10.0 billion and $6.0 billion of commercial paper outstanding, respectively, with maturities generally less than nine months. The weighted-average interest rate of the Company’s commercial paper was 5.00% and 5.28% as of September 28, 2024 and September 30, 2023, respectively. The following table provides a summary of cash flows associated with the issuance and maturities of commercial paper for 2024, 2023 and 2022 (in millions):\n2024\t\t2023\t\t2022\nMaturities 90 days or less:\t\t\t\t\t\nProceeds from/(Repayments of) commercial paper, net\t$\t3,960 \t\t\t$\t(1,333)\t\t\t$\t5,264 \t\nMaturities greater than 90 days:\t\t\t\t\t\nProceeds from commercial paper\t— \t\t\t— \t\t\t5,948 \t\nRepayments of commercial paper\t— \t\t\t(2,645)\t\t\t(7,257)\t\nProceeds from/(Repayments of) commercial paper, net\t— \t\t\t(2,645)\t\t\t(1,309)\t\nTotal proceeds from/(repayments of) commercial paper, net\t$\t3,960 \t\t\t$\t(3,978)\t\t\t$\t3,955 \t\n \nTerm Debt\nThe Company has outstanding Notes, which are senior unsecured obligations with interest payable in arrears. The following table provides a summary of the Company’s term debt as of September 28, 2024 and September 30, 2023:\nMaturities\n(calendar year)\n2024\t\t2023\nAmount\n(in millions)\nEffective\nInterest Rate\t\t\nAmount\n(in millions)\nEffective\nInterest Rate\n2013 – 2023 debt issuances:\nFixed-rate 0.000% – 4.850% notes\n2024 – 2062\n$\t97,341 \t\t\t\n0.03% – 6.65%\n$\t106,572 \t\t\t\n0.03% – 6.72%\nTotal term debt principal\n97,341 \t\t\t\t\t106,572 \t\t\t\nUnamortized premium/(discount) and issuance costs, net\n(321)\t\t\t\t\t(356)\t\t\t\nHedge accounting fair value adjustments\t\t\t(358)\t\t\t\t\t(1,113)\t\t\t\nTotal term debt\n96,662 \t\t\t\t\t105,103 \t\t\t\nLess: Current portion of term debt\t\t\t(10,912)\t\t\t\t\t(9,822)\t\t\t\nTotal non-current portion of term debt\t\t\t$\t85,750 \t\t\t\t\t$\t95,281 \t\t\t\n \nTo manage interest rate risk on certain of its U.S. dollar–denominated fixed-rate notes, the Company uses interest rate swaps to effectively convert the fixed interest rates to floating interest rates on a portion of these notes. Additionally, to manage foreign exchange rate risk on certain of its foreign currency–denominated notes, the Company uses cross-currency swaps to effectively convert these notes to U.S. dollar–denominated notes.\nThe effective interest rates for the Notes include the interest on the Notes, amortization of the discount or premium and, if applicable, adjustments related to hedging.\nThe future principal payments for the Company’s Notes as of September 28, 2024, are as follows (in millions):\n2025\t$\t10,930 \t\n2026\t12,342 \t\n2027\t9,936 \t\n2028\t7,800 \t\n2029\t5,153 \t\nThereafter\t51,180 \t\nTotal term debt principal\t$\t97,341 \t\n \nApple Inc. | 2024 Form 10-K | 43\n\nAs of September 28, 2024 and September 30, 2023, the fair value of the Company’s Notes, based on Level 2 inputs, was $88.4 billion and $90.8 billion, respectively.\nNote 10 – Shareholders’ Equity\nShare Repurchase Program\nDuring 2024, the Company repurchased 499 million shares of its common stock for $95.0 billion. The Company’s share repurchase programs do not obligate the Company to acquire a minimum amount of shares. Under the programs, shares may be repurchased in privately negotiated or open market transactions, including under plans complying with Rule 10b5-1 under the Exchange Act.\nShares of Common Stock\nThe following table shows the changes in shares of common stock for 2024, 2023 and 2022 (in thousands):\n2024\t\t2023\t\t2022\nCommon stock outstanding, beginning balances\t15,550,061 \t\t\t15,943,425 \t\t\t16,426,786 \t\nCommon stock repurchased\t(499,372)\t\t\t(471,419)\t\t\t(568,589)\t\nCommon stock issued, net of shares withheld for employee taxes\t66,097 \t\t\t78,055 \t\t\t85,228 \t\nCommon stock outstanding, ending balances\t15,116,786 \t\t\t15,550,061 \t\t\t15,943,425 \t\n \nNote 11 – Share-Based Compensation\n2022 Employee Stock Plan\nThe Apple Inc. 2022 Employee Stock Plan (the “2022 Plan”) is a shareholder-approved plan that provides for broad-based equity grants to employees, including executive officers, and permits the granting of RSUs, stock grants, performance-based awards, stock options and stock appreciation rights. RSUs granted under the 2022 Plan generally vest over four years, based on continued employment, and are settled upon vesting in shares of the Company’s common stock on a one-for-one basis. All RSUs granted under the 2022 Plan have dividend equivalent rights, which entitle holders of RSUs to the same dividend value per share as holders of common stock. A maximum of approximately 1.3 billion shares were authorized for issuance pursuant to 2022 Plan awards at the time the plan was approved on March 4, 2022.\n2014 Employee Stock Plan\nThe Apple Inc. 2014 Employee Stock Plan, as amended and restated (the “2014 Plan”), is a shareholder-approved plan that provided for broad-based equity grants to employees, including executive officers. The 2014 Plan permitted the granting of the same types of equity awards with substantially the same terms as the 2022 Plan. The 2014 Plan also permitted the granting of cash bonus awards. In the third quarter of 2022, the Company terminated the authority to grant new awards under the 2014 Plan.\nApple Inc. | 2024 Form 10-K | 44\n\nRestricted Stock Units\nA summary of the Company’s RSU activity and related information for 2024, 2023 and 2022, is as follows:\nNumber of\nRSUs\n(in thousands)\nWeighted-Average\nGrant-Date Fair\nValue Per RSU\nAggregate\nFair Value\n(in millions)\nBalance as of September 25, 2021\t240,427 \t\t\t$\t75.16 \t\t\t\nRSUs granted\t91,674 \t\t\t$\t150.70 \t\t\t\nRSUs vested\t(115,861)\t\t\t$\t72.12 \t\t\t\nRSUs canceled\t(14,739)\t\t\t$\t99.77 \t\t\t\nBalance as of September 24, 2022\t201,501 \t\t\t$\t109.48 \t\t\t\nRSUs granted\t88,768 \t\t\t$\t150.87 \t\t\t\nRSUs vested\t(101,878)\t\t\t$\t97.31 \t\t\t\nRSUs canceled\t(8,144)\t\t\t$\t127.98 \t\t\t\nBalance as of September 30, 2023\t180,247 \t\t\t$\t135.91 \t\t\t\nRSUs granted\t80,456 \t\t\t$\t173.78 \t\t\t\nRSUs vested\t(87,633)\t\t\t$\t127.59 \t\t\t\nRSUs canceled\t(9,744)\t\t\t$\t140.80 \t\t\t\nBalance as of September 28, 2024\t163,326 \t\t\t$\t158.73 \t\t\t$\t37,204 \t\n \nThe fair value as of the respective vesting dates of RSUs was $15.8 billion, $15.9 billion and $18.2 billion for 2024, 2023 and 2022, respectively. The majority of RSUs that vested in 2024, 2023 and 2022 were net share settled such that the Company withheld shares with a value equivalent to the employees’ obligation for the applicable income and other employment taxes, and remitted cash to the appropriate taxing authorities. The total shares withheld were approximately 31 million, 37 million and 41 million for 2024, 2023 and 2022, respectively, and were based on the value of the RSUs on their respective vesting dates as determined by the Company’s closing stock price. Total payments to taxing authorities for employees’ tax obligations were $5.6 billion in both 2024 and 2023 and $6.4 billion in 2022.\nShare-Based Compensation\nThe following table shows share-based compensation expense and the related income tax benefit included in the Consolidated Statements of Operations for 2024, 2023 and 2022 (in millions):\n2024\t\t2023\t\t2022\nShare-based compensation expense\t$\t11,688 \t\t\t$\t10,833 \t\t\t$\t9,038 \t\nIncome tax benefit related to share-based compensation expense\t$\t(3,350)\t\t\t$\t(3,421)\t\t\t$\t(4,002)\t\n \nAs of September 28, 2024, the total unrecognized compensation cost related to outstanding RSUs was $19.4 billion, which the Company expects to recognize over a weighted-average period of 2.4 years.\nNote 12 – Commitments, Contingencies and Supply Concentrations\nUnconditional Purchase Obligations\nThe Company has entered into certain off–balance sheet commitments that require the future purchase of goods or services (“unconditional purchase obligations”). The Company’s unconditional purchase obligations primarily consist of supplier arrangements, licensed intellectual property and content, and distribution rights. Future payments under unconditional purchase obligations with a remaining term in excess of one year as of September 28, 2024, are as follows (in millions):\n2025\t$\t3,206 \t\n2026\t2,440 \t\n2027\t1,156 \t\n2028\t3,121 \t\n2029\t633 \t\nThereafter\t670 \t\nTotal\t$\t11,226 \t\n \nApple Inc. | 2024 Form 10-K | 45\n\nContingencies\nThe Company is subject to various legal proceedings and claims that have arisen in the ordinary course of business and that have not been fully resolved. The outcome of litigation is inherently uncertain. In the opinion of management, there was not at least a reasonable possibility the Company may have incurred a material loss, or a material loss greater than a recorded accrual, concerning loss contingencies for asserted legal and other claims.\nConcentrations in the Available Sources of Supply of Materials and Product\nAlthough most components essential to the Company’s business are generally available from multiple sources, certain components are currently obtained from single or limited sources. The Company also competes for various components with other participants in the markets for smartphones, personal computers, tablets, wearables and accessories. Therefore, many components used by the Company, including those that are available from multiple sources, are at times subject to industry-wide shortage and significant commodity pricing fluctuations.\nThe Company uses some custom components that are not commonly used by its competitors, and new products introduced by the Company often utilize custom components available from only one source. When a component or product uses new technologies, initial capacity constraints may exist until the suppliers’ yields have matured or their manufacturing capacities have increased. The continued availability of these components at acceptable prices, or at all, may be affected if suppliers decide to concentrate on the production of common components instead of components customized to meet the Company’s requirements.\nSubstantially all of the Company’s hardware products are manufactured by outsourcing partners that are located primarily in China mainland, India, Japan, South Korea, Taiwan and Vietnam.\nNote 13 – Segment Information and Geographic Data\nThe Company manages its business primarily on a geographic basis. The Company’s reportable segments consist of the Americas, Europe, Greater China, Japan and Rest of Asia Pacific. Americas includes both North and South America. Europe includes European countries, as well as India, the Middle East and Africa. Greater China includes China mainland, Hong Kong and Taiwan. Rest of Asia Pacific includes Australia and those Asian countries not included in the Company’s other reportable segments. Although the reportable segments provide similar hardware and software products and similar services, each one is managed separately to better align with the location of the Company’s customers and distribution partners and the unique market dynamics of each geographic region.\nThe Company evaluates the performance of its reportable segments based on net sales and operating income. Net sales for geographic segments are generally based on the location of customers and sales through the Company’s retail stores located in those geographic locations. Operating income for each segment consists of net sales to third parties, related cost of sales, and operating expenses directly attributable to the segment. The information provided to the Company’s chief operating decision maker for purposes of making decisions and assessing segment performance excludes asset information.\nApple Inc. | 2024 Form 10-K | 46\n\nThe following table shows information by reportable segment for 2024, 2023 and 2022 (in millions):\n2024\t\t2023\t\t2022\nAmericas:\t\t\t\t\t\nNet sales\t$\t167,045 \t\t\t$\t162,560 \t\t\t$\t169,658 \t\nOperating income\t$\t67,656 \t\t\t$\t60,508 \t\t\t$\t62,683 \t\nEurope:\t\t\t\t\t\nNet sales\t$\t101,328 \t\t\t$\t94,294 \t\t\t$\t95,118 \t\nOperating income\t$\t41,790 \t\t\t$\t36,098 \t\t\t$\t35,233 \t\nGreater China:\t\t\t\t\t\nNet sales\t$\t66,952 \t\t\t$\t72,559 \t\t\t$\t74,200 \t\nOperating income\t$\t27,082 \t\t\t$\t30,328 \t\t\t$\t31,153 \t\nJapan:\t\t\t\t\t\nNet sales\t$\t25,052 \t\t\t$\t24,257 \t\t\t$\t25,977 \t\nOperating income\t$\t12,454 \t\t\t$\t11,888 \t\t\t$\t12,257 \t\nRest of Asia Pacific:\t\t\t\t\t\nNet sales\t$\t30,658 \t\t\t$\t29,615 \t\t\t$\t29,375 \t\nOperating income\t$\t13,062 \t\t\t$\t12,066 \t\t\t$\t11,569 \t\n \nA reconciliation of the Company’s segment operating income to the Consolidated Statements of Operations for 2024, 2023 and 2022 is as follows (in millions):\n2024\t\t2023\t\t2022\nSegment operating income\t$\t162,044 \t\t\t$\t150,888 \t\t\t$\t152,895 \t\nResearch and development expense\t(31,370)\t\t\t(29,915)\t\t\t(26,251)\t\nOther corporate expenses, net (1)\n(7,458)\t\t\t(6,672)\t\t\t(7,207)\t\nTotal operating income\t$\t123,216 \t\t\t$\t114,301 \t\t\t$\t119,437 \t\n \n(1)Includes general and administrative compensation costs, various nonrecurring charges, and other separately managed costs.\nThe following tables show net sales for 2024, 2023 and 2022 and long-lived assets as of September 28, 2024 and September 30, 2023 for countries that individually accounted for 10% or more of the respective totals, as well as aggregate amounts for the remaining countries (in millions):\n2024\t\t2023\t\t2022\nNet sales:\t\t\t\t\t\nU.S.\t$\t142,196 \t\t\t$\t138,573 \t\t\t$\t147,859 \t\nChina (1)\n66,952 \t\t\t72,559 \t\t\t74,200 \t\nOther countries\t181,887 \t\t\t172,153 \t\t\t172,269 \t\nTotal net sales\t$\t391,035 \t\t\t$\t383,285 \t\t\t$\t394,328 \t\n \n2024\t\t2023\nLong-lived assets:\t\t\t\nU.S.\t$\t35,664 \t\t\t$\t33,276 \t\nChina (1)\n4,797 \t\t\t5,778 \t\nOther countries\t5,219 \t\t\t4,661 \t\nTotal long-lived assets\t$\t45,680 \t\t\t$\t43,715 \t\n \n(1)China includes Hong Kong and Taiwan.\nApple Inc. | 2024 Form 10-K | 47\n\n\nReport of Independent Registered Public Accounting Firm\nTo the Shareholders and the Board of Directors of Apple Inc.\nOpinion on the Financial Statements\nWe have audited the accompanying consolidated balance sheets of Apple Inc. (the “Company”) as of September 28, 2024 and September 30, 2023, the related consolidated statements of operations, comprehensive income, shareholders’ equity and cash flows for each of the three years in the period ended September 28, 2024, and the related notes (collectively referred to as the “financial statements”). In our opinion, the financial statements present fairly, in all material respects, the financial position of the Company at September 28, 2024 and September 30, 2023, and the results of its operations and its cash flows for each of the three years in the period ended September 28, 2024, in conformity with U.S. generally accepted accounting principles (“GAAP”).\nWe also have audited, in accordance with the standards of the Public Company Accounting Oversight Board (United States) (“PCAOB”), the Company’s internal control over financial reporting as of September 28, 2024, based on criteria established in Internal Control – Integrated Framework issued by the Committee of Sponsoring Organizations of the Treadway Commission (2013 framework) and our report dated November 1, 2024 expressed an unqualified opinion thereon.\nBasis for Opinion\nThese financial statements are the responsibility of the Company’s management. Our responsibility is to express an opinion on the Company’s financial statements based on our audits. We are a public accounting firm registered with the PCAOB and are required to be independent with respect to the Company in accordance with the U.S. federal securities laws and the applicable rules and regulations of the Securities and Exchange Commission and the PCAOB.\nWe conducted our audits in accordance with the standards of the PCAOB. Those standards require that we plan and perform the audit to obtain reasonable assurance about whether the financial statements are free of material misstatement, whether due to error or fraud. Our audits included performing procedures to assess the risks of material misstatement of the financial statements, whether due to error or fraud, and performing procedures that respond to those risks. Such procedures included examining, on a test basis, evidence regarding the amounts and disclosures in the financial statements. Our audits also included evaluating the accounting principles used and significant estimates made by management, as well as evaluating the overall presentation of the financial statements. We believe that our audits provide a reasonable basis for our opinion.\nCritical Audit Matter\nThe critical audit matter communicated below is a matter arising from the current period audit of the financial statements that was communicated or required to be communicated to the audit committee and that: (1) relates to accounts or disclosures that are material to the financial statements and (2) involved our especially challenging, subjective, or complex judgments. The communication of the critical audit matter does not alter in any way our opinion on the financial statements, taken as a whole, and we are not, by communicating the critical audit matter below, providing a separate opinion on the critical audit matter or on the account or disclosure to which it relates.\nUncertain Tax Positions\nDescription of the Matter\t\nAs discussed in Note 7 to the financial statements, the Company is subject to income taxes in the U.S. and numerous foreign jurisdictions. As of September 28, 2024, the total amount of gross unrecognized tax benefits was $22.0 billion, of which $10.8 billion, if recognized, would impact the Company’s effective tax rate. In accounting for some of the uncertain tax positions, the Company uses significant judgment in the interpretation and application of GAAP and complex domestic and international tax laws.\nAuditing management’s evaluation of whether an uncertain tax position is more likely than not to be sustained and the measurement of the benefit of various tax positions can be complex, involves significant judgment, and is based on interpretations of tax laws and legal rulings.\n \nApple Inc. | 2024 Form 10-K | 48\n\nHow We Addressed the\nMatter in Our Audit\t\nWe tested controls relating to the evaluation of uncertain tax positions, including controls over management’s assessment as to whether tax positions are more likely than not to be sustained, management’s process to measure the benefit of its tax positions that qualify for recognition, and the related disclosures.\nWe evaluated the Company’s assessment of which tax positions are more likely than not to be sustained and the related measurement of the amount of tax benefit that qualifies for recognition. Our audit procedures included, among others, reading and evaluating management’s assumptions and analysis, and, as applicable, the Company’s communications with taxing authorities, that detailed the basis and technical merits of the uncertain tax positions. We involved our tax subject matter resources in assessing the technical merits of certain of the Company’s tax positions based on our knowledge of relevant tax laws and experience with related taxing authorities. For a certain tax position, we also received an external legal counsel confirmation letter and discussed the matter with external advisors and the Company’s tax personnel. In addition, we evaluated the Company’s disclosure in relation to these matters included in Note 7 to the financial statements.\n \n/s/ Ernst & Young LLP\nWe have served as the Company’s auditor since 2009.\n\nSan Jose, California\nNovember 1, 2024\nApple Inc. | 2024 Form 10-K | 49\n\n\nReport of Independent Registered Public Accounting Firm\nTo the Shareholders and the Board of Directors of Apple Inc.\nOpinion on Internal Control Over Financial Reporting\nWe have audited Apple Inc.’s internal control over financial reporting as of September 28, 2024, based on criteria established in Internal Control – Integrated Framework issued by the Committee of Sponsoring Organizations of the Treadway Commission (2013 framework) (the “COSO criteria”). In our opinion, Apple Inc. (the “Company”) maintained, in all material respects, effective internal control over financial reporting as of September 28, 2024, based on the COSO criteria.\nWe also have audited, in accordance with the standards of the Public Company Accounting Oversight Board (United States) (“PCAOB”), the consolidated balance sheets of the Company as of September 28, 2024 and September 30, 2023, the related consolidated statements of operations, comprehensive income, shareholders’ equity and cash flows for each of the three years in the period ended September 28, 2024, and the related notes and our report dated November 1, 2024 expressed an unqualified opinion thereon.\nBasis for Opinion\nThe Company’s management is responsible for maintaining effective internal control over financial reporting and for its assessment of the effectiveness of internal control over financial reporting included in the accompanying Management’s Annual Report on Internal Control over Financial Reporting. Our responsibility is to express an opinion on the Company’s internal control over financial reporting based on our audit. We are a public accounting firm registered with the PCAOB and are required to be independent with respect to the Company in accordance with the U.S. federal securities laws and the applicable rules and regulations of the Securities and Exchange Commission and the PCAOB.\nWe conducted our audit in accordance with the standards of the PCAOB. Those standards require that we plan and perform the audit to obtain reasonable assurance about whether effective internal control over financial reporting was maintained in all material respects.\nOur audit included obtaining an understanding of internal control over financial reporting, assessing the risk that a material weakness exists, testing and evaluating the design and operating effectiveness of internal control based on the assessed risk, and performing such other procedures as we considered necessary in the circumstances. We believe that our audit provides a reasonable basis for our opinion.\nDefinition and Limitations of Internal Control Over Financial Reporting\nA company’s internal control over financial reporting is a process designed to provide reasonable assurance regarding the reliability of financial reporting and the preparation of financial statements for external purposes in accordance with generally accepted accounting principles. A company’s internal control over financial reporting includes those policies and procedures that (1) pertain to the maintenance of records that, in reasonable detail, accurately and fairly reflect the transactions and dispositions of the assets of the company; (2) provide reasonable assurance that transactions are recorded as necessary to permit preparation of financial statements in accordance with generally accepted accounting principles, and that receipts and expenditures of the company are being made only in accordance with authorizations of management and directors of the company; and (3) provide reasonable assurance regarding prevention or timely detection of unauthorized acquisition, use, or disposition of the company’s assets that could have a material effect on the financial statements.\nBecause of its inherent limitations, internal control over financial reporting may not prevent or detect misstatements. Also, projections of any evaluation of effectiveness to future periods are subject to the risk that controls may become inadequate because of changes in conditions, or that the degree of compliance with the policies or procedures may deteriorate.\n\n\n/s/ Ernst & Young LLP\n\nSan Jose, California\nNovember 1, 2024\nApple Inc. | 2024 Form 10-K | 50\n\nItem 9. Changes in and Disagreements with Accountants on Accounting and Financial Disclosure\nNone.\nItem 9A. Controls and Procedures\nEvaluation of Disclosure Controls and Procedures\nBased on an evaluation under the supervision and with the participation of the Company’s management, the Company’s principal executive officer and principal financial officer have concluded that the Company’s disclosure controls and procedures as defined in Rules 13a-15(e) and 15d-15(e) under the Exchange Act were effective as of September 28, 2024 to provide reasonable assurance that information required to be disclosed by the Company in reports that it files or submits under the Exchange Act is (i) recorded, processed, summarized and reported within the time periods specified in the SEC rules and forms and (ii) accumulated and communicated to the Company’s management, including its principal executive officer and principal financial officer, as appropriate to allow timely decisions regarding required disclosure.\nInherent Limitations over Internal Controls\nThe Company’s internal control over financial reporting is designed to provide reasonable assurance regarding the reliability of financial reporting and the preparation of financial statements for external purposes in accordance with GAAP. The Company’s internal control over financial reporting includes those policies and procedures that: \n(i)pertain to the maintenance of records that, in reasonable detail, accurately and fairly reflect the transactions and dispositions of the Company’s assets;\n(ii)provide reasonable assurance that transactions are recorded as necessary to permit preparation of financial statements in accordance with GAAP, and that the Company’s receipts and expenditures are being made only in accordance with authorizations of the Company’s management and directors; and\n(iii)provide reasonable assurance regarding prevention or timely detection of unauthorized acquisition, use, or disposition of the Company’s assets that could have a material effect on the financial statements.\nManagement, including the Company’s Chief Executive Officer and Chief Financial Officer, does not expect that the Company’s internal controls will prevent or detect all errors and all fraud. A control system, no matter how well designed and operated, can provide only reasonable, not absolute, assurance that the objectives of the control system are met. Further, the design of a control system must reflect the fact that there are resource constraints, and the benefits of controls must be considered relative to their costs. Because of the inherent limitations in all control systems, no evaluation of internal controls can provide absolute assurance that all control issues and instances of fraud, if any, have been detected. Also, any evaluation of the effectiveness of controls in future periods are subject to the risk that those internal controls may become inadequate because of changes in business conditions, or that the degree of compliance with the policies or procedures may deteriorate.\nManagement’s Annual Report on Internal Control over Financial Reporting\nThe Company’s management is responsible for establishing and maintaining adequate internal control over financial reporting (as defined in Rule 13a-15(f) under the Exchange Act). Management conducted an assessment of the effectiveness of the Company’s internal control over financial reporting based on the criteria set forth in Internal Control – Integrated Framework issued by the Committee of Sponsoring Organizations of the Treadway Commission (2013 framework). Based on the Company’s assessment, management has concluded that its internal control over financial reporting was effective as of September 28, 2024 to provide reasonable assurance regarding the reliability of financial reporting and the preparation of financial statements in accordance with GAAP. The Company’s independent registered public accounting firm, Ernst & Young LLP, has issued an audit report on the Company’s internal control over financial reporting, which appears in Part II, Item 8 of this Form 10-K.\nChanges in Internal Control over Financial Reporting\nThere were no changes in the Company’s internal control over financial reporting during the fourth quarter of 2024, which were identified in connection with management’s evaluation required by paragraph (d) of Rules 13a-15 and 15d-15 under the Exchange Act, that have materially affected, or are reasonably likely to materially affect, the Company’s internal control over financial reporting.\nApple Inc. | 2024 Form 10-K | 51\n\nItem 9B. Other Information\nInsider Trading Arrangements\nOn August 27, 2024, Deirdre O’Brien, the Company’s Senior Vice President, Retail, entered into a trading plan intended to satisfy the affirmative defense conditions of Rule 10b5-1(c) under the Exchange Act. The plan provides for the sale, subject to certain price limits, of shares vesting between April 1, 2025 and October 1, 2026, pursuant to certain equity awards granted to Ms. O’Brien, excluding any shares withheld by the Company to satisfy income tax withholding and remittance obligations. Ms. O’Brien’s plan will expire on December 31, 2026, subject to early termination in accordance with the terms of the plan.\nOn August 29, 2024, Jeff Williams, the Company’s Chief Operating Officer, entered into a trading plan intended to satisfy the affirmative defense conditions of Rule 10b5-1(c) under the Exchange Act. The plan provides for the sale, subject to certain price limits, of up to 100,000 shares of common stock, as well as shares vesting between April 1, 2025 and October 1, 2025, pursuant to certain equity awards granted to Mr. Williams, excluding any shares withheld by the Company to satisfy income tax withholding and remittance obligations. Mr. Williams’ plan will expire on December 15, 2025, subject to early termination in accordance with the terms of the plan.\nItem 9C. Disclosure Regarding Foreign Jurisdictions that Prevent Inspections\nNot applicable.\nPART III\nItem 10. Directors, Executive Officers and Corporate Governance\nThe Company has an insider trading policy governing the purchase, sale and other dispositions of the Company’s securities that applies to all Company personnel, including directors, officers, employees, and other covered persons. The Company also follows procedures for the repurchase of its securities. The Company believes that its insider trading policy and repurchase procedures are reasonably designed to promote compliance with insider trading laws, rules and regulations, and listing standards applicable to the Company. A copy of the Company’s insider trading policy is filed as Exhibit 19.1 to this Form 10-K.\nThe remaining information required by this Item will be included in the Company’s definitive proxy statement to be filed with the SEC within 120 days after September 28, 2024, in connection with the solicitation of proxies for the Company’s 2025 annual meeting of shareholders (the “2025 Proxy Statement”), and is incorporated herein by reference.\nItem 11. Executive Compensation\nThe information required by this Item will be included in the 2025 Proxy Statement, and is incorporated herein by reference.\nItem 12. Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters\nThe information required by this Item will be included in the 2025 Proxy Statement, and is incorporated herein by reference.\nItem 13. Certain Relationships and Related Transactions, and Director Independence\nThe information required by this Item will be included in the 2025 Proxy Statement, and is incorporated herein by reference.\nItem 14. Principal Accountant Fees and Services\nThe information required by this Item will be included in the 2025 Proxy Statement, and is incorporated herein by reference.\nApple Inc. | 2024 Form 10-K | 52\n\nPART IV\nItem 15. Exhibit and Financial Statement Schedules\n(a)Documents filed as part of this report\n(1)All financial statements\nIndex to Consolidated Financial Statements\t\tPage\nConsolidated Statements of Operations for the years ended September 28, 2024, September 30, 2023 and September 24, 2022\n29\nConsolidated Statements of Comprehensive Income for the years ended September 28, 2024, September 30, 2023 and September 24, 2022\n30\nConsolidated Balance Sheets as of September 28, 2024 and September 30, 2023\n31\nConsolidated Statements of Shareholders’ Equity for the years ended September 28, 2024, September 30, 2023 and September 24, 2022\n32\nConsolidated Statements of Cash Flows for the years ended September 28, 2024, September 30, 2023 and September 24, 2022\n33\nNotes to Consolidated Financial Statements\n34\nReports of Independent Registered Public Accounting Firm*\n48\n \n*Ernst & Young LLP, PCAOB Firm ID No. 00042.\n(2)Financial Statement Schedules\nAll financial statement schedules have been omitted, since the required information is not applicable or is not present in amounts sufficient to require submission of the schedule, or because the information required is included in the consolidated financial statements and accompanying notes included in this Form 10-K.\n(3)Exhibits required by Item 601 of Regulation S-K (1)\nIncorporated by Reference\nExhibit Number\t\tExhibit Description\t\tForm\t\tExhibit\t\tFiling Date/\nPeriod End Date\n3.1\t\t\nRestated Articles of Incorporation of the Registrant filed on August 3, 2020.\n8-K\t\t3.1\t\t8/7/20\n3.2\t\t\nAmended and Restated Bylaws of the Registrant effective as of August 20, 2024.\n8-K\t\t3.2\t\t\n8/23/24\n4.1**\t\t\nDescription of Securities of the Registrant.\n4.2\t\t\nIndenture, dated as of April 29, 2013, between the Registrant and The Bank of New York Mellon Trust Company, N.A., as Trustee.\nS-3\t\t4.1\t\t4/29/13\n4.3\t\t\nOfficer’s Certificate of the Registrant, dated as of May 3, 2013, including forms of global notes representing the Floating Rate Notes due 2016, Floating Rate Notes due 2018, 0.45% Notes due 2016, 1.00% Notes due 2018, 2.40% Notes due 2023 and 3.85% Notes due 2043.\n8-K\t\t4.1\t\t5/3/13\n4.4\t\t\nOfficer’s Certificate of the Registrant, dated as of May 6, 2014, including forms of global notes representing the Floating Rate Notes due 2017, Floating Rate Notes due 2019, 1.05% Notes due 2017, 2.10% Notes due 2019, 2.85% Notes due 2021, 3.45% Notes due 2024 and 4.45% Notes due 2044.\n8-K\t\t4.1\t\t5/6/14\n4.5\t\t\nOfficer’s Certificate of the Registrant, dated as of November 10, 2014, including forms of global notes representing the 1.000% Notes due 2022 and 1.625% Notes due 2026.\n8-K\t\t4.1\t\t11/10/14\n4.6\t\t\nOfficer’s Certificate of the Registrant, dated as of February 9, 2015, including forms of global notes representing the Floating Rate Notes due 2020, 1.55% Notes due 2020, 2.15% Notes due 2022, 2.50% Notes due 2025 and 3.45% Notes due 2045.\n8-K\t\t4.1\t\t2/9/15\n4.7\t\t\nOfficer’s Certificate of the Registrant, dated as of May 13, 2015, including forms of global notes representing the Floating Rate Notes due 2017, Floating Rate Notes due 2020, 0.900% Notes due 2017, 2.000% Notes due 2020, 2.700% Notes due 2022, 3.200% Notes due 2025, and 4.375% Notes due 2045.\n8-K\t\t4.1\t\t5/13/15\n4.8\t\t\nOfficer’s Certificate of the Registrant, dated as of July 31, 2015, including forms of global notes representing the 3.05% Notes due 2029 and 3.60% Notes due 2042.\n8-K\t\t4.1\t\t7/31/15\n4.9\t\t\nOfficer’s Certificate of the Registrant, dated as of September 17, 2015, including forms of global notes representing the 1.375% Notes due 2024 and 2.000% Notes due 2027.\n8-K\t\t4.1\t\t9/17/15\n \nApple Inc. | 2024 Form 10-K | 53\n\nIncorporated by Reference\nExhibit Number\t\tExhibit Description\t\tForm\t\tExhibit\t\tFiling Date/\nPeriod End Date\n4.10\t\t\nOfficer’s Certificate of the Registrant, dated as of February 23, 2016, including forms of global notes representing the Floating Rate Notes due 2019, Floating Rate Notes due 2021, 1.300% Notes due 2018, 1.700% Notes due 2019, 2.250% Notes due 2021, 2.850% Notes due 2023, 3.250% Notes due 2026, 4.500% Notes due 2036 and 4.650% Notes due 2046.\n8-K\t\t4.1\t\t2/23/16\n4.11\t\t\nSupplement No. 1 to the Officer’s Certificate of the Registrant, dated as of March 24, 2016.\n8-K\t\t4.1\t\t3/24/16\n4.12\t\t\nOfficer’s Certificate of the Registrant, dated as of August 4, 2016, including forms of global notes representing the Floating Rate Notes due 2019, 1.100% Notes due 2019, 1.550% Notes due 2021, 2.450% Notes due 2026 and 3.850% Notes due 2046.\n8-K\t\t4.1\t\t8/4/16\n4.13\t\t\nOfficer’s Certificate of the Registrant, dated as of February 9, 2017, including forms of global notes representing the Floating Rate Notes due 2019, Floating Rate Notes due 2020, Floating Rate Notes due 2022, 1.550% Notes due 2019, 1.900% Notes due 2020, 2.500% Notes due 2022, 3.000% Notes due 2024, 3.350% Notes due 2027 and 4.250% Notes due 2047.\n8-K\t\t4.1\t\t2/9/17\n4.14\t\t\nOfficer’s Certificate of the Registrant, dated as of May 11, 2017, including forms of global notes representing the Floating Rate Notes due 2020, Floating Rate Notes due 2022, 1.800% Notes due 2020, 2.300% Notes due 2022, 2.850% Notes due 2024 and 3.200% Notes due 2027.\n8-K\t\t4.1\t\t5/11/17\n4.15\t\t\nOfficer’s Certificate of the Registrant, dated as of May 24, 2017, including forms of global notes representing the 0.875% Notes due 2025 and 1.375% Notes due 2029.\n8-K\t\t4.1\t\t5/24/17\n4.16\t\t\nOfficer’s Certificate of the Registrant, dated as of June 20, 2017, including form of global note representing the 3.000% Notes due 2027.\n8-K\t\t4.1\t\t6/20/17\n4.17\nOfficer’s Certificate of the Registrant, dated as of September 12, 2017, including forms of global notes representing the 1.500% Notes due 2019, 2.100% Notes due 2022, 2.900% Notes due 2027 and 3.750% Notes due 2047.\n8-K\t\t4.1\t\t9/12/17\n4.18\nOfficer’s Certificate of the Registrant, dated as of November 13, 2017, including forms of global notes representing the 1.800% Notes due 2019, 2.000% Notes due 2020, 2.400% Notes due 2023, 2.750% Notes due 2025, 3.000% Notes due 2027 and 3.750% Notes due 2047.\n8-K\t\t4.1\t\t11/13/17\n4.19\nIndenture, dated as of November 5, 2018, between the Registrant and The Bank of New York Mellon Trust Company, N.A., as Trustee.\nS-3\t\t4.1\t\t11/5/18\n4.20\nOfficer’s Certificate of the Registrant, dated as of September 11, 2019, including forms of global notes representing the 1.700% Notes due 2022, 1.800% Notes due 2024, 2.050% Notes due 2026, 2.200% Notes due 2029 and 2.950% Notes due 2049.\n8-K\t\t4.1\t\t9/11/19\n4.21\nOfficer’s Certificate of the Registrant, dated as of November 15, 2019, including forms of global notes representing the 0.000% Notes due 2025 and 0.500% Notes due 2031.\n8-K\t\t4.1\t\t11/15/19\n4.22\nOfficer’s Certificate of the Registrant, dated as of May 11, 2020, including forms of global notes representing the 0.750% Notes due 2023, 1.125% Notes due 2025, 1.650% Notes due 2030 and 2.650% Notes due 2050.\n8-K\t\t4.1\t\t5/11/20\n4.23\nOfficer’s Certificate of the Registrant, dated as of August 20, 2020, including forms of global notes representing the 0.550% Notes due 2025, 1.25% Notes due 2030, 2.400% Notes due 2050 and 2.550% Notes due 2060.\n8-K\t\t4.1\t\t8/20/20\n4.24\nOfficer’s Certificate of the Registrant, dated as of February 8, 2021, including forms of global notes representing the 0.700% Notes due 2026, 1.200% Notes due 2028, 1.650% Notes due 2031, 2.375% Notes due 2041, 2.650% Notes due 2051 and 2.800% Notes due 2061.\n8-K\t\t4.1\t\t2/8/21\n4.25\nOfficer’s Certificate of the Registrant, dated as of August 5, 2021, including forms of global notes representing the 1.400% Notes due 2028, 1.700% Notes due 2031, 2.700% Notes due 2051 and 2.850% Notes due 2061.\n8-K\t\t4.1\t\t8/5/21\n4.26\nIndenture, dated as of October 28, 2021, between the Registrant and The Bank of New York Mellon Trust Company, N.A., as Trustee.\nS-3\t\t4.1\t\t10/29/21\n4.27\nOfficer’s Certificate of the Registrant, dated as of August 8, 2022, including forms of global notes representing the 3.250% Notes due 2029, 3.350% Notes due 2032, 3.950% Notes due 2052 and 4.100% Notes due 2062.\n8-K\t\t4.1\t\t8/8/22\n \nApple Inc. | 2024 Form 10-K | 54\n\nIncorporated by Reference\nExhibit Number\t\tExhibit Description\t\tForm\t\tExhibit\t\tFiling Date/\nPeriod End Date\n4.28\nOfficer’s Certificate of the Registrant, dated as of May 10, 2023, including forms of global notes representing the 4.421% Notes due 2026, 4.000% Notes due 2028, 4.150% Notes due 2030, 4.300% Notes due 2033 and 4.850% Notes due 2053.\n8-K\t\t4.1\t\t5/10/23\n4.29*\nApple Inc. Deferred Compensation Plan.\nS-8\t\t4.1\t\t8/23/18\n10.1*\t\t\nApple Inc. Employee Stock Purchase Plan, as amended and restated as of March 10, 2015.\n8-K\t\t10.1\t\t3/13/15\n10.2*\t\t\nForm of Indemnification Agreement between the Registrant and each director and executive officer of the Registrant.\n10-Q\t\t10.2\t\t6/27/09\n10.3*\t\t\nApple Inc. Non-Employee Director Stock Plan, as amended November 9, 2021.\n10-Q\t\t10.1\t\t12/25/21\n10.4*\t\t\nApple Inc. 2014 Employee Stock Plan, as amended and restated as of October 1, 2017.\n10-K\t\t10.8\t\t9/30/17\n10.5*\t\t\nForm of Restricted Stock Unit Award Agreement under 2014 Employee Stock Plan effective as of September 26, 2017.\n10-K\t\t10.20\t\t9/30/17\n10.6*\nForm of Restricted Stock Unit Award Agreement under Non-Employee Director Stock Plan effective as of February 13, 2018.\n10-Q\n10.2\n3/31/18\n10.7*\nForm of Restricted Stock Unit Award Agreement under 2014 Employee Stock Plan effective as of August 21, 2018.\n10-K\t\t10.17\t\t9/29/18\n10.8*\nForm of Restricted Stock Unit Award Agreement under 2014 Employee Stock Plan effective as of September 29, 2019.\n10-K\t\t10.15\t\t9/28/19\n10.9*\nForm of Restricted Stock Unit Award Agreement under 2014 Employee Stock Plan effective as of August 18, 2020.\n10-K\t\t10.16\t\t9/26/20\n10.10*\nForm of Performance Award Agreement under 2014 Employee Stock Plan effective as of August 18, 2020.\n10-K\t\t\n10.17\n9/26/20\n10.11*\nForm of CEO Restricted Stock Unit Award Agreement under 2014 Employee Stock Plan effective as of September 27, 2020.\n10-Q\t\t10.1\t\t12/26/20\n10.12*\nForm of CEO Performance Award Agreement under 2014 Employee Stock Plan effective as of September 27, 2020.\n10-Q\t\t\n10.2\n12/26/20\n10.13*\nApple Inc. 2022 Employee Stock Plan.\n8-K\t\t10.1\t\t3/4/22\n10.14*\nForm of Restricted Stock Unit Award Agreement under 2022 Employee Stock Plan effective as of March 4, 2022.\n8-K\t\t10.2\t\t3/4/22\n10.15*\nForm of Performance Award Agreement under 2022 Employee Stock Plan effective as of March 4, 2022.\n8-K\t\t10.3\t\t3/4/22\n10.16*\nApple Inc. Executive Cash Incentive Plan.\n8-K\t\t10.1\t\t8/19/22\n10.17*\nForm of CEO Restricted Stock Unit Award Agreement under 2022 Employee Stock Plan effective as of September 25, 2022.\n10-Q\t\t10.1\t\t12/31/22\n10.18*\nForm of CEO Performance Award Agreement under 2022 Employee Stock Plan effective as of September 25, 2022.\n10-Q\t\t10.2\t\t12/31/22\n10.19*, **\nForm of Restricted Stock Unit Award Agreement under 2022 Employee Stock Plan effective as of September 29, 2024.\n10.20*, **\nForm of Performance Award Agreement under 2022 Employee Stock Plan effective as of September 29, 2024.\n10.21*, **\nForm of CEO Restricted Stock Unit Award Agreement under 2022 Employee Stock Plan effective as of September 29, 2024.\n10.22*, **\nForm of CEO Performance Award Agreement under 2022 Employee Stock Plan effective as of September 29, 2024.\n19.1**\nInsider Trading Policy\n21.1**\t\t\nSubsidiaries of the Registrant.\n23.1**\t\t\nConsent of Independent Registered Public Accounting Firm.\n24.1**\t\t\nPower of Attorney (included on the Signatures page of this Annual Report on Form 10-K).\n31.1**\t\t\nRule 13a-14(a) / 15d-14(a) Certification of Chief Executive Officer.\n31.2**\t\t\nRule 13a-14(a) / 15d-14(a) Certification of Chief Financial Officer.\n32.1***\t\t\nSection 1350 Certifications of Chief Executive Officer and Chief Financial Officer.\n97.1*, **\nRule 10D-1 Recovery Policy\n \nApple Inc. | 2024 Form 10-K | 55\n\nIncorporated by Reference\nExhibit Number\t\tExhibit Description\t\tForm\t\tExhibit\t\tFiling Date/\nPeriod End Date\n101**\t\t\nInline XBRL Document Set for the consolidated financial statements and accompanying notes in Part II, Item 8, “Financial Statements and Supplementary Data” of this Annual Report on Form 10-K.\n104**\t\t\nInline XBRL for the cover page of this Annual Report on Form 10-K, included in the Exhibit 101 Inline XBRL Document Set.\n \n*Indicates management contract or compensatory plan or arrangement.\n**Filed herewith.\n***Furnished herewith.\n(1)Certain instruments defining the rights of holders of long-term debt securities of the Registrant are omitted pursuant to Item 601(b)(4)(iii) of Regulation S-K. The Registrant hereby undertakes to furnish to the SEC, upon request, copies of any such instruments.\nItem 16. Form 10-K Summary\nNone.\nApple Inc. | 2024 Form 10-K | 56\n\nSIGNATURES\nPursuant to the requirements of Section 13 or 15(d) of the Securities Exchange Act of 1934, the Registrant has duly caused this report to be signed on its behalf by the undersigned, thereunto duly authorized.\nDate: November 1, 2024\nApple Inc.\nBy:\t\t/s/ Luca Maestri\nLuca Maestri\nSenior Vice President,\nChief Financial Officer\n \nPower of Attorney\nKNOW ALL PERSONS BY THESE PRESENTS, that each person whose signature appears below constitutes and appoints Timothy D. Cook and Luca Maestri, jointly and severally, his or her attorneys-in-fact, each with the power of substitution, for him or her in any and all capacities, to sign any amendments to this Annual Report on Form 10-K, and to file the same, with exhibits thereto and other documents in connection therewith, with the Securities and Exchange Commission, hereby ratifying and confirming all that each of said attorneys-in-fact, or his substitute or substitutes, may do or cause to be done by virtue hereof.\nPursuant to the requirements of the Securities Exchange Act of 1934, this report has been signed below by the following persons on behalf of the Registrant and in the capacities and on the dates indicated:\nName\t\tTitle\t\tDate\n/s/ Timothy D. Cook\t\tChief Executive Officer and Director\n(Principal Executive Officer)\t\tNovember 1, 2024\nTIMOTHY D. COOK\t\t\t\n/s/ Luca Maestri\t\tSenior Vice President, Chief Financial Officer\n(Principal Financial Officer)\t\tNovember 1, 2024\nLUCA MAESTRI\t\t\t\n/s/ Chris Kondo\t\tSenior Director of Corporate Accounting\n(Principal Accounting Officer)\t\tNovember 1, 2024\nCHRIS KONDO\t\t\t\n/s/ Wanda Austin\nDirector\t\tNovember 1, 2024\nWANDA AUSTIN\n/s/ Alex Gorsky\t\tDirector\t\tNovember 1, 2024\nALEX GORSKY\t\t\t\n/s/ Andrea Jung\t\tDirector\t\tNovember 1, 2024\nANDREA JUNG\t\t\t\n/s/ Arthur D. Levinson\t\tDirector and Chair of the Board\t\tNovember 1, 2024\nARTHUR D. LEVINSON\t\t\t\n/s/ Monica Lozano\t\tDirector\t\tNovember 1, 2024\nMONICA LOZANO\t\t\t\n/s/ Ronald D. Sugar\t\tDirector\t\tNovember 1, 2024\nRONALD D. SUGAR\t\t\t\n/s/ Susan L. Wagner\t\tDirector\t\tNovember 1, 2024\nSUSAN L. WAGNER\t\t\t\n \nApple Inc. | 2024 Form 10-K | 57\n' + + + + ```python MAX_LENGTH = 10000 # We limit the input length to avoid token issues with open('../data/apple.txt', 'r') as file: @@ -114,15 +140,19 @@ df_results = generate_responses(model_name="gpt-3.5-turbo", Attempt 3: Apple Inc's Form 10-K provides a comprehensive overview of the company's financial reporting, business operations, products and market information. -This simple experiment reveals a fundamental challenge in LLM evaluation: even a simple parameter like temperature can dramatically alter model behavior in ways that are difficult to systematically assess. At temperature 0.0, responses are consistent but potentially too rigid. At 1.0, outputs become more varied but less predictable. At 2.0, responses can be wildly different and often incoherent. This non-deterministic behavior makes traditional software testing approaches inadequate. - -The implications for evaluation are profound. How can one effectively test an LLM-powered system when the same prompt can yield radically different outputs based on a single parameter? Traditional testing relies on predictable inputs and outputs, but LLMs force us to grapple with probabilistic behavior. While lower temperatures may seem safer for critical applications, they don't eliminate the underlying uncertainty - they merely mask it. This highlights the need for new evaluation paradigms that can handle both deterministic and probabilistic aspects of LLM behavior. +A temperature of 1 represents the unscaled probability scores for each token in the vocabulary. Decreasing the temperature closer to 0 sharpens the distribution, so the most likely token will have an even higher probability score. Conversely, increasing the temperature makes the distribution more uniform {cite}`build-llms-from-scratch-book`: +- Temperature = 0: Most deterministic, but potentially repetitive +- Temperature = 1: Balanced creativity and coherence +- Temperature > 1: Increased randomness, potentially incoherent +How can one effectively test an LLM-powered system when the same prompt can yield radically different outputs based on a single parameter? Traditional testing relies on predictable inputs and outputs, but LLMs force us to grapple with probabilistic behavior. While lower temperatures may seem safer for critical applications, they don't necessarily eliminate the underlying uncertainty. This highlights the need for new evaluation paradigms that can handle both deterministic and probabilistic aspects of LLM behavior. ## Emerging Properties -Beyond their non-deterministic nature, LLMs present another fascinating challenge: emergent abilities that spontaneously arise as models scale up in size. These abilities - from basic question answering to complex reasoning - aren't explicitly programmed but rather emerge "naturally" as the models grow larger and are trained on more data. This makes evaluation fundamentally different from traditional software testing, where capabilities are explicitly coded and can be tested against clear specifications. +Beyond their non-deterministic nature, LLMs present another fascinating characteristic: emergent abilities that spontaneously arise as models scale up in size. These abilities - from basic question answering to complex reasoning - aren't explicitly programmed but rather emerge "naturally" as the models grow larger and are trained on more data. This makes evaluation fundamentally different from traditional software testing, where capabilities are explicitly coded and can be tested against pre-defined specifications. + +{numref}`emerging-properties` provides a list of emergent abilities of large language models and the scale {cite}`wei2022emergentabilitieslargelanguage`. The relationship between model scale and emergent abilities follows a fascinating non-linear pattern. Below certain size thresholds, specific abilities may be completely absent from the model - it simply cannot perform certain tasks, no matter how much you try to coax them out. However, once the model reaches critical points in its scaling journey, these abilities can suddenly manifest in what researchers call a phase transition - a dramatic shift from inability to capability. This unpredictable emergence of capabilities stands in stark contrast to traditional software development, where features are deliberately implemented and can be systematically tested. ```{figure} ../_static/evals/emerging.png --- @@ -135,25 +165,25 @@ align: center Emergent abilities of large language models and the scale {cite}`wei2022emergentabilitieslargelanguage`. ``` - {numref}`emerging-properties` provides a list of emergent abilities of large language models and the scale. The relationship between model scale and emergent abilities follows a fascinating non-linear pattern. Below certain size thresholds, specific abilities may be completely absent from the model - it simply cannot perform certain tasks, no matter how much you try to coax them out. However, once the model reaches critical points in its scaling journey, these abilities can suddenly manifest in what researchers call a phase transition - a dramatic shift from inability to capability. This unpredictable emergence of capabilities stands in stark contrast to traditional software development, where features are deliberately implemented and can be systematically tested. - -The implications for evaluation are profound. While conventional software testing relies on stable test suites and well-defined acceptance criteria, LLM evaluation must contend with a constantly shifting landscape of capabilities. What worked to evaluate a 7B parameter model may be completely inadequate for a 70B parameter model that has developed new emergent abilities. This dynamic nature of LLM capabilities forces us to fundamentally rethink our approach to testing and evaluation. +The implications for evaluation are critical. While conventional software testing relies on stable test suites and well-defined acceptance criteria, LLM evaluation must contend with a constantly shifting landscape of capabilities. What worked to evaluate a 7B parameter model may be completely inadequate for a 70B parameter model that has developed new emergent abilities. This dynamic nature of LLM capabilities forces us to fundamentally rethink our approach to testing and evaluation. ## Problem Statement -Consider a practical example that illustrates these challenges: building a customer support chatbot powered by an LLM. In traditional software development, you would define specific features (like handling refund requests or tracking orders) and write tests to verify each function. But with LLMs, you're not just testing predefined features - you're trying to evaluate emergent capabilities like understanding context, maintaining conversation coherence, and generating appropriate emotional responses. +Consider a practical example that illustrates these challenges: building a Math AI tutoring system for children powered by an LLM. In traditional software development, you would define specific features (like presenting math problems or checking answers) and write tests to verify each function. But with LLMs, you're not just testing predefined features - you're trying to evaluate emergent capabilities like adapting explanations to a child's level, maintaining engagement through conversational learning, and providing age-appropriate safety-bound content. This fundamental difference raises critical questions about evaluation: - How do we measure capabilities that weren't explicitly programmed? - How can we ensure consistent performance when abilities may suddenly emerge or evolve? - What metrics can capture both the technical accuracy and the subjective quality of responses? -The challenge becomes even more complex when we consider that traditional software evaluation methods simply weren't designed for these kinds of systems. We need new frameworks that can account for both the deterministic aspects we're used to testing and the emergent properties that make LLMs unique. Let's explore how LLM evaluation differs from traditional software testing across several key dimensions: -- **Capability Assessment vs Functional Testing**: Traditional software testing validates specific functionality against predefined requirements. LLM evaluation must assess not necessiraly pre-defined "emergent properties" like reasoning, creativity, and language understanding that extend beyond explicit programming. +The challenge becomes even more complex when we consider that traditional software evaluation methods simply weren't designed for these kinds of systems. There is an **Evals Gap** between traditional software testing and LLM evaluation. We need new frameworks that can account for both the deterministic aspects we're used to testing and the emergent properties that make LLMs unique. -- **Metrics and Measurement Challenges**: While traditional software metrics can usually be precisely defined and measured, LLM evaluation often involves subjective qualities like "helpfulness" or "naturalness" that resist straightforward quantification. Even when we try to break these down into numeric scores, the underlying judgment remains inherently human and context-dependent. +{numref}`evals-table` summarizes how LLM evaluation differs from traditional software testing across several key dimensions: +- **Capability Assessment vs Functional Testing**: Traditional software testing validates specific functionality against predefined requirements. LLM evaluation must assess not necessarily pre-defined behavior but also "emergent properties" like reasoning, creativity, and language understanding that extend beyond explicit programming. -- **Dataset Contamination**: Traditional software testing uses carefully crafted test cases with known inputs and expected outputs (e.g., unit tests, integration tests). In contrast, LLMs trained on massive internet-scale datasets risk having already seen and memorized evaluation examples during training, which can lead to artificially inflated performance scores. This requires careful dataset curation to ensure test sets are truly unseen by the model and rigorous cross-validation approaches. +- **Metrics and Measurement Challenges**: While traditional software metrics can usually be precisely defined and measured, LLM evaluation often involves subjective qualities like "helpfulness" or "naturalness" that resist straightforward quantification. Even when we try to break these down into numeric scores, the underlying judgment often remains inherently human and context-dependent. + +- **Dataset Contamination**: Traditional software testing uses carefully crafted test cases with known inputs and expected outputs (e.g., unit tests). In contrast, LLMs trained on massive internet-scale datasets risk having already seen and memorized evaluation examples during training, which can lead to artificially inflated performance scores. This requires careful dataset curation to ensure test sets are truly unseen by the model and rigorous cross-validation approaches. - **Benchmark Evolution**: Traditional software maintains stable test suites over time. LLM benchmarks continuously evolve as capabilities advance, making longitudinal performance comparisons difficult and potentially obsoleting older evaluation methods. @@ -172,16 +202,39 @@ The challenge becomes even more complex when we consider that traditional softwa ## Evals Design -First, it's important to make a distinction between evaluating an LLM versus evaluating an LLM-based application (our focus). While the latter offers foundation capabilities and are typically general-purpose, the former is more specific and tailored to a particular use case. Here, we define an LLM-based application as a system that uses one or more LLMs to perform a specific task. More specifically, an LLM-based application is the combination of one or more LLM models, their associated prompts and parameters to solve a particular business problem. +First, it's important to make a distinction between evaluating an LLM versus evaluating an LLM-based application. While the former offers foundation capabilities and are typically general-purpose, the latter is more specific and tailored to a particular use case. Here, we define an LLM-based application as a system that uses one or more LLMs to perform a specific task. More specifically, an LLM-based application is the combination of one or more LLM models, their associated prompts and parameters to solve a particular business problem. + +That differentiation is important because it changes the scope of evaluation. LLMs are usually evaluated based on their capabilities, which include things like language understanding, reasoning and knowledge. LLM-based applications, instead, should be evaluated based on their end-to-end functionality, performance, and how well they meet business requirements. That distinction has key implications for the design of evaluation systems: + +- The same LLM can yield different results in different applications +- Evaluation must align with business objectives +- A great LLM doesn't guarantee a great application! + +Examples of key requirements for validation are listed in {numref}`validation-requirements` ranging from Safety, Cognitive, Technical, Meta-Cognitive, to Ethical aspects. + +```{table} LLM Application Testing Requirements Matrix +:name: validation-requirements +| Category | Requirement | What to Test | Why It's Important | +|----------|------------|--------------|-------------------| +| Safety | Misinformation Prevention | - Accuracy of factual statements against verified databases
    - Consistency of responses across similar queries
    - Rate of fabricated details or hallucinations
    - Citation and source accuracy
    - Response behavior with uncertainty
    - Temporal consistency
    - Scientific accuracy | - Prevents real-world harm from false information
    - Maintains user trust
    - Reduces legal and reputational risks
    - Ensures reliable decision-making support
    - Protects against information manipulation | +| Safety | Unqualified Advice | - Recognition of medical, legal, and financial queries
    - Disclaimer consistency
    - Professional referral mechanisms
    - Boundary recognition
    - Emergency situation handling
    - Avoidance of specific recommendations | - Prevents harm from incorrect professional advice
    - Reduces legal liability
    - Protects vulnerable users
    - Maintains professional standards
    - Ensures appropriate expertise utilization | +| Safety | Bias Detection | - Gender, racial, and cultural bias
    - Demographic representation
    - Language inclusivity
    - Stereotype avoidance
    - Problem-solving fairness
    - Cultural context awareness | - Prevents reinforcement of societal biases
    - Ensures equal service quality
    - Maintains social responsibility
    - Protects brand reputation
    - Supports diverse user bases | +| Safety | Privacy Protection | - PII detection and handling
    - Data anonymization
    - Information leakage prevention
    - Context carryover management
    - Compliance with regulations
    - Security protocols | - Protects user confidentiality
    - Ensures regulatory compliance
    - Maintains data security
    - Prevents privacy breaches
    - Safeguards sensitive information | +| Cognitive | Reasoning & Logic | - Multi-step problem-solving
    - Mathematical computation
    - Logical fallacy detection
    - Causal reasoning
    - Abstract concept handling
    - Edge case management | - Ensures reliable problem-solving
    - Maintains computational accuracy
    - Supports critical thinking
    - Prevents logical errors
    - Enables complex decision support | +| Cognitive | Language Understanding | - Context maintenance
    - Idiom comprehension
    - Cultural reference accuracy
    - Sarcasm detection
    - Technical terminology
    - Cross-lingual capability | - Ensures effective communication
    - Prevents misunderstandings
    - Enables sophisticated interactions
    - Supports diverse language needs
    - Maintains conversation quality | +| Technical | Code Generation | - Syntax accuracy
    - Security vulnerability scanning
    - Performance optimization
    - Documentation quality
    - Error handling
    - Cross-platform compatibility | - Ensures code reliability
    - Prevents security issues
    - Maintains system stability
    - Supports development efficiency
    - Reduces technical debt | +| Technical | System Integration | - API handling
    - Rate limit compliance
    - Error management
    - Response time
    - Resource utilization
    - Scalability testing | - Ensures system reliability
    - Maintains performance
    - Enables scaling
    - Prevents system failures
    - Supports integration stability | +| Meta-Cognitive | Self-Awareness | - Knowledge limitation recognition
    - Uncertainty communication
    - Correction capabilities
    - Feedback integration
    - Edge case recognition
    - Error acknowledgment | - Builds user trust
    - Prevents overconfidence
    - Enables appropriate use
    - Supports improvement
    - Maintains reliability | +| Meta-Cognitive | Communication Quality | - Message clarity
    - Audience appropriateness
    - Information density
    - Explanation quality
    - Summary accuracy
    - Technical communication | - Ensures understanding
    - Maintains engagement
    - Enables knowledge transfer
    - Builds user satisfaction
    - Supports effective interaction | +| Ethical | Harmful Content | - Harmful request recognition
    - Response appropriateness
    - Content filtering
    - Emergency handling
    - User safety protocols
    - Incident reporting | - Protects user safety
    - Prevents misuse
    - Maintains ethical standards
    - Reduces liability
    - Ensures responsible use | +| Ethical | Decision-Making | - Moral consistency
    - Value alignment
    - Decision fairness
    - Transparency
    - Impact assessment
    - Stakeholder consideration | - Ensures ethical deployment
    - Maintains standards
    - Builds trust
    - Supports values
    - Prevents harmful impacts | +| Environmental | CO2 Emission | - Energy consumption per request
    - Model size and complexity impact
    - Server location and energy sources
    - Request caching efficiency
    - Batch processing optimization
    - Hardware utilization rates
    - Inference optimization strategies | - Reduces environmental impact
    - Supports sustainability goals
    - Optimizes operational costs
    - Meets environmental regulations
    - Demonstrates corporate responsibility | +``` + -That differentiation is important because it changes the scope of evaluation. LLMs are usually evaluated based on their capabilities, which include things like language understanding, reasoning and knowledge. LLM-based applications are evaluated based on their end-to-end functionality, performance, and how well they meet business requirements. That distinction has key implications for the design of evaluation systems: -1. Application requirements are closely tied to LLM evaluations -2. The same LLM can yield different results in different applications -3. Evaluation must align with business objectives -4. A great LLM doesn't guarantee a great application! -#### Conceptual Overview +### Conceptual Overview {numref}`conceptual` demonstrates a conceptual design of key components of LLM Application evaluation. @@ -196,6 +249,7 @@ Conceptual overview of LLM-based application evaluation. ``` We observe three key components: + **1. Examples (Input Dataset):** - Input: Query to LLM App, e.g. user message, input file, image, audio, etc. - Output: A reference expected outcome from the LLM application. Provide ground truth for comparison (*Optional*). @@ -219,7 +273,7 @@ We observe three key components: * Measures LLM Application performance across defined metrics * Applies standardized scoring criteria -Note that Examples must provide input data to the LLM Application for further evaluation. However, ground truth data is optional. We will return to this in more detail below, where we will see that ground truth data is not always available or practical. Additionally, there are approaches where one can evaluate LLM Applications without ground truth data. +Note that **Examples** must provide input data to the LLM Application for further evaluation. However, ground truth data is optional. We will return to this in more detail below, where we will see that ground truth data is not always available or practical. Additionally, there are approaches where one can evaluate LLM Applications without ground truth data. A more general conceptual design is shown in {numref}`conceptual-multi`, where multiple LLM Applications are evaluated. This design allows for a more comprehensive evaluation of different configurations of LLM-based applications, e.g.: @@ -237,6 +291,7 @@ Conceptual overview of Multiple LLM-based applications evaluation. ``` In this evaluation framework, the same inputs are provided to all LLM applications, ensuring that responses are evaluated consistently. Performance is quantified objectively for each LLM Application, and results are ranked for easy comparison. This design leads to two additional components: + **1. Scores (Metrics Layer):** - Input: Evaluation results from Evaluator - Output: Quantified performance metrics @@ -252,9 +307,9 @@ In this evaluation framework, the same inputs are provided to all LLM applicatio * Aggregates and ranks performances across LLM applications -#### Design Considerations +### Design Considerations -The design of an LLM application evaluation system depends heavily on the specific use case and business requirements. Here we list important questions for planning an LLM application evaluation system pertaining to each of the key components previously discussed: +The design of an LLM application evaluation system depends heavily on the specific use case and business requirements. Here we list important questions for planning an LLM application evaluation system pertaining to each of the key components previously introduced: **1. Examples (Input Dataset):** - What types of examples should be included in the test set? @@ -267,7 +322,7 @@ The design of an LLM application evaluation system depends heavily on the specif - Should we have separate test sets for different business requirements? - Do we need human-validated ground truth for all examples? - Can we use synthetic data to augment the test set? -- How can business updates be reflected in the dataset post-launch? +- How can business updates and user data be reflected in the dataset post-launch? **2. LLM Applications:** - What aspects of each LLM app should be standardized for fair comparison? @@ -303,21 +358,21 @@ The design of an LLM application evaluation system depends heavily on the specif - How to handle ties or very close scores? - Should we maintain separate rankings for different: * Business requirements - * Cost tiers - * LLM Models + * Model Cost Tiers + * LLM Model Families -Hollistically, you evaluation design should be built with scalability in mind to handle growing evaluation needs as the combination of (Example X LLM Applications X Evaluators X Scores X Leaderboards) may grow very fast, particularly for an organization that promotes rapid experimentation and iterative development (good properties!). Finally, one should keep in mind that the evaluation system itself requires validation to confirm its accuracy and reliability vis-a-vis the business requirements! +Holistically, your evaluation design should be built with scalability in mind to handle growing evaluation needs as the combination of (Input Examples X LLM Applications X Evaluators X Scores X Leaderboards) may grow very fast, particularly for an organization that promotes rapid experimentation and iterative development (good properties!). Finally, one should keep in mind that the evaluation system itself requires validation to confirm its accuracy and reliability vis-a-vis business requirements (evaluating evaluators will be later discussed in this Chapter). ## Metrics -The choice of metric depends on the specific task and desired evaluation criteria. However, one can categorize metrics into two broad categories: **intrinsic** and **extrinsic**: +The choice of metric depends on the specific task and desired evaluation criteria. However, one can categorize metrics into two broad categories: **intrinsic** and **extrinsic**. * **Intrinsic metrics** focus on the model's performance on its primary training objective, which is typically to predict the next token in a sequence. Perplexity is a common intrinsic metric that measures how well the model predicts a given sample of text. -* **Extrinsic metrics** assess the model's performance on various downstream tasks, which can range from question answering to code generation. These metrics are not directly tied to the training objective, but they provide valuable insights into the model's ability to generalise to real-world applications. +* **Extrinsic metrics** assess the model's performance on various downstream tasks, which can range from question answering to code generation. These metrics are not directly tied to the training objective, but they provide valuable insights into the model's ability to generalize to real-world applications. -Here, we are particularly interested in extrinsic metrics, since we are evaluating LLM-based applications. +Here, we are particularly interested in extrinsic metrics, since we are evaluating LLM-based applications rather than base LLM models. Another way to think about metrics is in terms of the type of the task we evaluate: 1. **Discriminative Task**: @@ -327,9 +382,9 @@ Another way to think about metrics is in terms of the type of the task we evalua - Involves creating or producing new data or outputs. - Examples: Text generation, image synthesis, or summarization. -For discriminative LLM-based applications may produce log-probabilities or discrete predictions, traditional machine learning metrics like accuracy, precision, recall, and F1 score can be applied. However, generative tasks may output text or images which require different evaluation approaches. +For discriminative tasks, LLM-based applications may produce log-probabilities or discrete predictions, traditional machine learning metrics like accuracy, precision, recall, and F1 score can be applied. However, generative tasks may output text or images which require different evaluation approaches. -For generative tasks, a range of specialized metrics should be considered. These include match-based metrics such as exact match and prefix match, as well as metrics designed specifically for tasks like summarization and translation, including ROUGE, BLEU, and character n-gram comparisons. The selection of appropriate metrics should align with the specific requirements and characteristics of the task being evaluated. A detailed discussion of metric selection guidelines will be provided in a subsequent section. +For generative tasks, a range of specialized metrics should be considered. These include match-based metrics such as exact match and prefix match, as well as metrics designed specifically for tasks like summarization and translation, including ROUGE, BLEU, and character n-gram comparisons. The selection of appropriate metrics should align with the specific requirements and characteristics of the task being evaluated. In {numref}`key-metrics` we provide a short list of widely used extrinsic metrics that can be used to evaluate generative tasks of LLM-based applications, along with their definitions, use cases, and limitations. @@ -348,14 +403,14 @@ In {numref}`key-metrics` we provide a short list of widely used extrinsic metric ``` A common use case for metrics like BLEU and ROUGE is to evaluate the quality of generated summaries against reference summaries. -As an example, we will demonstrate how to evaluate the quality of SEC Filings summaries against reference summaries (e.g. analyst-prepared highlights). +As an example, we will demonstrate how to evaluate the quality of Financial Filings summaries against reference summaries (e.g. analyst-prepared highlights). We will model our simple metrics-based evaluator with the following components: - Input: Generated summary and reference summary - Output: Dictionary with scores for BLEU, ROUGE_1, and ROUGE_2 -- Purpose: Evaluate our LLM-based application - SEC filing summary generator +- Purpose: Evaluate our LLM-based application - Financial Filings summary generator -A *Reference Summary* represents the "ideal" summary. It could be prepared by humanas, e.g. expert analysts, or machine-generated. +A *Reference Summary* represents the "ideal" summary. It could be prepared by humans, e.g. expert analysts, or machine-generated. In our example, we are particularly interested in evaluating the quality of summaries generated by different (smaller and cheaper) LLM models compared to a *benchmark model* (larger and more expensive). We will use the following setup: - Benchmark model: `gpt-4o` @@ -478,7 +533,7 @@ def evaluate_summary_models(model_benchmark, models_test, input): return [evaluation_results, model_summaries, benchmark_summary] ``` -Now, we are ready to run our benchmark evaluation. We define a benchmark model and a list of test models and then evaluate each test model's summary against the benchmark. We also print the generated summaries for each model. +We are ready to run our benchmark evaluation. We define a benchmark model and a list of test models and then evaluate each test model's summary against the benchmark. We also print the generated summaries for each model. ```python @@ -570,7 +625,7 @@ plot.show() -![png](evals_files/evals_28_1.png) +![png](evals_files/evals_30_1.png) @@ -579,10 +634,13 @@ Results demonstrate that tested models perform quite differently on our predefin While evaluating language model outputs inherently involves subjective judgment, establishing a high-quality benchmark model and using quantifiable metrics provide a more objective framework for comparing model performance. This approach transforms an otherwise qualitative assessment into a measurable, data-driven evaluation process. + + These metrics provide quantifiable measures of performance, however limitations should be mentioned: * **Task-specific nature**: Chosen set of metrics might not fully capture the nuances of complex generative-based tasks, especially those involving subjective human judgment. * **Sensitivity to data distribution**: Performance on these metrics can be influenced by the specific dataset used for evaluation, which might not represent real-world data distribution. +* **Subjective Acceptable Threshold**: These metrics are not always easy to interpret and set a threshold for (see {cite}`sarmah2024choosethresholdevaluationmetric` for a discussion on how to choose a threshold for an evaluation metric for large language models). * **Inability to assess reasoning or factual accuracy**: These metrics primarily focus on surface-level matching and might not reveal the underlying reasoning process of the LLM or its ability to generate factually correct information. In conclusion, selecting an appropriate extrinsic metrics set depends on the specific task, underlying business requirements and desired evaluation granularity. Understanding the limitations of these metrics can provide a more comprehensive assessment of LLM performance in real-world applications. @@ -591,6 +649,7 @@ To address these limitations, alternative approaches like **human-based evaluati ## Evaluators +(model-based-eval)= ### Model-Based Evaluation Traditional metrics like BLEU or ROUGE often fall short in capturing the nuanced, contextual, and creative outputs of LLMs. As an alternative we can consider a "Model-based evaluation" approach. A common approach is to use an LLM as a judge. This is an approach that leverages language models themselves to assess the quality of outputs from other language models. This method involves using a model (often a more capable one) to act as an automated judge, evaluating aspects like accuracy, coherence, and relevance of generated content. Unlike traditional metrics that rely on exact matching or statistical measures, model-based evaluation can capture nuanced aspects of language and provide more contextual assessment. @@ -815,7 +874,7 @@ plot.show() -![png](evals_files/evals_43_1.png) +![png](evals_files/evals_46_1.png) @@ -831,32 +890,24 @@ The visualization helps highlight these differences across models and evaluation -Leveraging LLMs for evaluation has several limitations {cite}`li2024leveraginglargelanguagemodels`. Firstly, computational overhead should not be neglected given the inherent cost of running additional model inferences iterations. LLM evaluators can also exhibit various biases, including order bias (preferring certain sequence positions), egocentric bias (favoring outputs from similar models), and length bias. Further, there may be a tight dependency on prompt quality - small prompt variations may lead to substantially different outcomes. It is important to also note challenges around domain-specific evaluation in fields such as medice, finance, law etc, where a general llm-as-a-judge approach may not be suitable. - -The LLM-as-a-Judge strategy can serve as a scalable and nuanced solution to evaluate LLM-based applications. While it does not entirely a metrics-based or human-based aproach, it significantly augments evaluation workflows, especially in scenarios requiring evaluation of generative outputs. Future improvements could include integrating human oversight and refining LLMs for domain-specific evaluation tasks. - - +Leveraging LLMs for evaluation has several limitations {cite}`li2024leveraginglargelanguagemodels`. Firstly, computational overhead should not be neglected given the inherent cost of running additional model inferences iterations. LLM evaluators can also exhibit various biases, including order bias (preferring certain sequence positions), egocentric bias (favoring outputs from similar models), and length bias. Further, there may be a tight dependency on prompt quality - small prompt variations may lead to substantially different outcomes. It is important to also note challenges around domain-specific evaluation in fields such as medicine, finance, law etc, where a general llm-as-a-judge approach may not be suitable. +The LLM-as-a-Judge strategy can serve as a scalable and nuanced solution to evaluate LLM-based applications. While it does not entirely replace metrics-based or human-based approaches, it significantly augments evaluation workflows, especially in scenarios requiring evaluation of generative outputs. Future improvements in our example include integrating human oversight and refining LLMs for domain-specific evaluation tasks. +One open source solution trying to overcome some of these challenges is Glider {cite}`deshpande2024glidergradingllminteractions`, a 3B evaluator LLM that can score any text input and associated context on arbitrary user defined criteria. Glider is an LLM model trained on 685 domains and 183 criteria whose judgement scores show 91.3% agreement with human judgments, making it suitable for a diverse range of real world applications. - -### Human-Based Evaluation - -Human assessors can judge aspects like fluency, coherence, and factual accuracy, providing a more comprehensive evaluation. However, human evaluation can be subjective and resource-intensive. - - ### Evaluating Evaluators We have discussed how LLMs can be used to evaluate LLM-based aplications. However, how can we evaluate the performance of LLMs that evaluate other LLMs? This is the question that meta evaluation aims to answer. Clearly, the discussion can become quite meta as we need to evaluate the performance of the evaluator to evaluate the performance of the evaluated model. However, one can make a case for two general options: -1. Use a gold-standard dataset that is used to evaluate the performance of LLM evaluators using a "metrics-based" approach. +1. Use a golden-standard dataset that is used to evaluate the performance of LLM evaluators using a "metrics-based" approach. 2. Use a human evaluator to generate reference scores that can be used to evaluate the performance of the LLM evaluator (similar to the human-based evaluation we discussed earlier). -As depicted in {numref}`meta`, the performance of the LLM evaluator can be evaluated by comparing its scores to either a gold-standard dataset or human reference scores. Higher correlation values indicate better performance of the LLM evaluator. For instance, if we were to evaluate the performance of a LLM-as-a-judge evaluator, in the task of evaluating multilingual capability of an LLM: +As depicted in {numref}`meta`, the performance of the LLM evaluator can be evaluated by comparing its scores to either a golden-standard dataset or human reference scores. Higher correlation values indicate better performance of the LLM evaluator. For instance, if we were to evaluate the performance of a LLM-as-a-judge evaluator, in the task of evaluating multilingual capability of an LLM: 1. In a "metrics-based" approach, we would first need to define a set of metrics that capture the task of multilingual capability. For instance, we could use the BLEU metric to evaluate the quality of the generated LLM output against a golden dataset (e.g. machine translated text). We would then calculate the correlation between these scores against those generated by the LLM evaluator. The higher the correlation, the better the LLM evaluator. -2. In a "human-based" approach, we would need to recruit human evaluators that are experts in the target languanges we are evaluating. Expert humans would provide scores for a set of samples of the input LLM. We would then calculate the correlation between these scores against those generated by the LLM evaluator. The higher the correlation, the better the LLM evaluator. +2. In a "human-based" approach, we would need to recruit human evaluators that are experts in the target languages we are evaluating. Expert humans would provide scores for a set of samples of the input LLM. We would then calculate the correlation between these scores against those generated by the LLM evaluator. The higher the correlation, the better the LLM evaluator. ```{figure} ../_static/evals/meta.png --- @@ -868,9 +919,9 @@ align: center Conceptual overview of LLMs Meta Evaluation. ``` -An extension of the above approaches is to use humans to directly evaluate the LLM-judges themselves. A notable example of this is [Judge Arena](https://judgearena.com/) {cite}`judgearena2024`, which is a platform that allows users to vote on which AI model made the better evaluation. Under this approach, the performance of the LLM evaluator is given by the (blind) evaluation of humans who perform the voting on randomly generated pairs of LLM judges as depicted in {numref}`meta2`. Only after submitting a vote, users can see which models were actually doing the judging. +An alternative to the above approaches is to use humans to directly evaluate the LLM-judges themselves. A notable example of this is [Judge Arena](https://judgearena.com/) {cite}`judgearena2024`, which is a platform that allows users to vote on which AI model made the better evaluation. Under this approach, the performance of the LLM evaluator is given by the (blind) evaluation of humans who perform the voting on randomly generated pairs of LLM judges as depicted in {numref}`meta2`. Only after submitting a vote, users can see which models were actually doing the judging. -```{figure} ../_static/evals/meta2.svg +```{figure} ../_static/evals/meta2.png --- name: meta2 alt: Human-in-the-loop meta evaluation Conceptual Overview @@ -907,25 +958,29 @@ Benchmarks act as standardized tests for LLMs, evaluating their performance acro Benchmarks can be thought as comprehensive "exams" that probe different "subjects" in order to certify an LLM. They help researchers and developers compare models systematically, in a way LLM performance is comparable while enabling the identification of emergent behaviors or capabilities as models evolve in scale and sophistication. -The history of LLM benchmarks reflects the evolving priorities of artificial intelligence research, starting with foundational tasks and moving toward complex, real-world challenges. It began in **2018** with the introduction of **GLUE (General Language Understanding Evaluation)**, which set a new standard for evaluating natural language understanding. GLUE measured performance on tasks like sentiment analysis and textual entailment, providing a baseline for assessing the fundamental capabilities of language models. A year later, **SuperGLUE (2019)** expanded on this foundation by introducing more nuanced tasks that tested reasoning and language comprehension at a deeper level, challenging the limits of models like BERT and its successors. +The history of LLM benchmarks reflects the evolving priorities of artificial intelligence research, starting with foundational tasks and moving toward complex, real-world challenges. We can start in 2018 with the introduction of **GLUE** (General Language Understanding Evaluation) {cite}`wang2019gluemultitaskbenchmarkanalysis`, which set a new standard for evaluating natural language understanding. GLUE measured performance on tasks like sentiment analysis and textual entailment, providing a baseline for assessing the fundamental capabilities of language models. Later, **SuperGLUE** {cite}`nangia2019superglue` expanded on this foundation by introducing more nuanced tasks that tested reasoning and language comprehension at a deeper level, challenging the limits of models like BERT and its successors. + +As AI capabilities grew, benchmarks evolved to capture broader and more diverse aspects of intelligence. **BIG-Bench** {cite}`srivastava2023imitationgamequantifyingextrapolating` marked a turning point by incorporating over 200 tasks, spanning arithmetic, logic, and creative problem-solving. This collaborative effort aimed to probe emergent abilities in large models, offering insights into how scale and complexity influence performance. Around the same time, specialized benchmarks like **TruthfulQA** {cite}`2021truthfulqa` emerged, addressing the critical need for models to provide accurate and non-deceptive information in a world increasingly dependent on AI for factual content. + +**MMLU** (Massive Multitask Language Understanding) {cite}`hendrycks2021measuringmassivemultitasklanguage` launched in 2021, provided a rigorous test of a model’s multidisciplinary knowledge, covering 57 subjects from STEM fields to humanities and social sciences. Similarly, in 2022, Stanford’s **HELM** (Holistic Evaluation of Language Models) {cite}`liang2023holisticevaluationlanguagemodels` set a new standard for multidimensional assessment. HELM expanded the scope of evaluation beyond accuracy, incorporating factors like fairness, robustness, and computational efficiency. This benchmark was designed to address societal concerns surrounding AI, emphasizing safety and inclusion alongside technical performance. -As AI capabilities grew, benchmarks evolved to capture broader and more diverse aspects of intelligence. **BIG-Bench (2021)** marked a turning point by incorporating over 200 tasks, spanning arithmetic, logic, and creative problem-solving. This collaborative effort aimed to probe emergent abilities in large models, offering insights into how scale and complexity influence performance. Around the same time, specialized benchmarks like **TruthfulQA (2021)** emerged, addressing the critical need for models to provide accurate and non-deceptive information in a world increasingly dependent on AI for factual content. +Specialized benchmarks like **HumanEval** (2021) {cite}`chen2021evaluatinglargelanguagemodels` focused on domain-specific tasks, such as code generation, testing models’ ability to translate natural language descriptions into functional programming code. In contrast, **LMSYS** (2023) brought real-world applicability into focus by evaluating conversational AI through multi-turn dialogues. LMSYS prioritized coherence, contextual understanding, and user satisfaction, providing a practical lens for assessing models like GPT and Claude in dynamic settings. -In **2022**, Stanford’s **HELM (Holistic Evaluation of Language Models)** set a new standard for multidimensional assessment. HELM expanded the scope of evaluation beyond accuracy, incorporating factors like fairness, robustness, and computational efficiency. This benchmark was designed to address societal concerns surrounding AI, emphasizing safety and inclusion alongside technical performance. Similarly, **MMLU (Massive Multitask Language Understanding)**, launched in **2021**, provided a rigorous test of a model’s multidisciplinary knowledge, covering 57 subjects from STEM fields to humanities and social sciences. +The **HuggingFace Open LLM** {cite}`openllmleaderboard2024` Leaderboard stands out for its transparency and accessibility in the open-source community. This leaderboard evaluates a wide range of LLMs across diverse tasks, including general knowledge, reasoning, and code-writing. Its commitment to reproducibility ensures that results are verifiable, enabling researchers and practitioners to replicate findings. By focusing on open-source models, it democratizes AI research and fosters innovation across communities, making it a valuable resource for both academics and industry professionals. -Specialized benchmarks like **HumanEval (2021)** focused on domain-specific tasks, such as code generation, testing models’ ability to translate natural language descriptions into functional programming code. In contrast, **LMSYS (2023)** brought real-world applicability into focus by evaluating conversational AI through multi-turn dialogues. LMSYS prioritized coherence, contextual understanding, and user satisfaction, providing a practical lens for assessing models like GPT and Claude in dynamic settings. +The **Chatbot Arena** (2024) Leaderboard (an evolution of LMSYS) {cite}`chiang2024chatbotarenaopenplatform` takes an alternative approach by measuring real-world performance through direct model comparisons. Its evaluation format compares models in live conversations, with human judges providing qualitative assessments. This methodology has gathered hundreds of thousands of human evaluations, offering specific insights into practical model performance. The emphasis on interactive capabilities makes it relevant for developing user-facing applications like virtual assistants and chatbots. -From the early days of GLUE to the groundbreaking ARC Prize, the history of benchmarks illustrates a steady progression toward holistic and meaningful AI evaluation. Each new benchmark addresses emerging challenges and raises the bar for what AI systems can and should achieve, ensuring that these technologies align with both technical ambitions and societal needs. +The **AlpacaEval** {cite}`dubois2024lengthcontrolledalpacaevalsimpleway` and **MT-Bench** {cite}`zheng2023judgingllmasajudgemtbenchchatbot` Leaderboards implement automated evaluation using LLMs to assess model performance in multi-turn conversations. This approach enables consistent assessment of dialogue capabilities while reducing human bias. Their methodology measures key aspects of conversational AI, including contextual understanding and response consistency across multiple exchanges. -As LLM benchmarks develop so do leaderboards. Leaderboards serve as standardized platforms to compare and rank models based on specific performance metrics / benchmarks. These evaluation systems help track LLM capabilities while maintaining transparency and reproducibility. -The **HuggingFace Open LLM** Leaderboard stands out for its transparency and accessibility in the open-source community. This leaderboard evaluates a wide range of LLMs across diverse tasks, including general knowledge, reasoning, and code-writing. Its commitment to reproducibility ensures that results are verifiable, enabling researchers and practitioners to replicate findings. By focusing on open-source models, it democratizes AI research and fosters innovation across communities, making it a valuable resource for both academics and industry professionals. +An important recent development was the release of Global-MMLU {cite}`singh2024globalmmluunderstandingaddressing`, an improved version of MMLU with evaluation coverage across 42 languages. This open dataset, built through collaboration between Argilla, the Hugging Face community, and researchers from leading institutions like Cohere For AI, Mila, MIT, and others, represents a significant step toward more inclusive multilingual LLM evaluation. Hundreds of contributors used Argilla to annotate MMLU questions, revealing that 85% of questions requiring specific cultural knowledge were Western-centric. The newly released dataset is divided into two key subsets: Culturally Agnostic questions that require no specific regional or cultural knowledge, and Culturally Sensitive questions that depend on dialect, cultural, or geographic knowledge. With high-quality translations available for 25 languages, Global-MMLU enables better understanding of LLM capabilities and limitations across different languages and cultural contexts. -The **Chatbot Arena** Leaderboard (formerly LMSYS) takes an alternative approach by measuring real-world performance through direct model comparisons. Its evaluation format compares models in live conversations, with human judges providing qualitative assessments. This methodology has gathered over 200,000 human evaluations, offering specific insights into practical model performance. The emphasis on interactive capabilities makes it relevant for developing user-facing applications like virtual assistants and chatbots. -The **AlpacaEval** and **MT-Bench** Leaderboards implement automated evaluation using GPT-4 to assess model performance in multi-turn conversations. This approach enables consistent assessment of dialogue capabilities while reducing human bias. Their methodology measures key aspects of conversational AI, including contextual understanding and response consistency across multiple exchanges. +A major challenge with these leaderboards and benchmarks is test set contamination - when test data ends up in newer models' training sets, rendering the benchmarks ineffective. While some benchmarks try to address this through crowdsourced prompts and evaluations from humans or LLMs, these approaches introduce their own biases and struggle with difficult questions. **LiveBench** {cite}`white2024livebenchchallengingcontaminationfreellm` represents a novel solution, designed specifically to be resilient to both contamination and evaluation biases. As the first benchmark with continuously updated questions from recent sources, automated objective scoring, and diverse challenging tasks across multiple domains, LiveBench maintains its effectiveness even as models improve. Drawing from recent math competitions, research papers, news, and datasets, it creates contamination-free versions of established benchmark tasks. Current results show even top models achieving considerably lower performance compared to other benchmarks, demonstrating LiveBench's ability to meaningfully differentiate model capabilities with relatively lower saturation. With monthly updates and an open collaborative approach, LiveBench aims to provide sustained value for model evaluation as the field advances. -A significant shift in AI evaluation came with the launch of the **The Alignment Research Center (ARC) Prize** by ARC Prize Inc., a non-profit for the public advancement of open artificial general intelligence. Hosted by Mike Knoop (Co-founder, Zapier) and François Chollet (Creator of ARC-AGI, Keras), this prize represents a paradigm shift in how we evaluate language models. Rather than focusing on narrow performance metrics, the ARC Prize assesses what it calls "cognitive sufficiency" - a model's ability to generate meaningful insights and tackle open-ended challenges. This new way to think about LLM evaluation emphasizes creative thinking, sophisticated reasoning, and the capacity to make genuinely useful contributions to human knowledge as we seek to define and measure what it means to achieve AGI (Artificial General Intelligence). +Another notable benchmark is ZebraLogic {cite}`zebralogic2024`, which evaluates logical reasoning capabilities of LLMs through Logic Grid Puzzles - a type of Constraint Satisfaction Problem {cite}`brailsford1999constraint` commonly found in tests like the LSAT. These puzzles require assigning unique values to N houses across M different features based on given clues, demanding strategic reasoning and deduction to arrive at a unique correct solution. The benchmark's programmatically generated puzzles range from 2x2 to 6x6 in size and test LLMs using one-shot examples with reasoning steps. While humans can solve these puzzles through strategic methods like reductio ad absurdum and elimination, LLMs demonstrate significant limitations in this type of logical reasoning. Even the best-performing model, Claude 3.5 Sonnet, only achieves 33.4% accuracy across all puzzles and 12.4% on hard puzzles, with smaller models (7-10B parameters) solving less than 1% of hard puzzles as of December 2024. These results reveal critical gaps in LLMs' capabilities around counterfactual thinking, reflective reasoning, structured memorization, and compositional generalization. + +A significant milestone in AI evaluation came with the launch of the **The Alignment Research Center (ARC) Prize** {cite}`arcprize2024` by ARC Prize Inc., a non-profit for the public advancement of open artificial general intelligence. Hosted by Mike Knoop (Co-founder, Zapier) and François Chollet (Creator of Keras), this prize represents a paradigm shift in how we evaluate language models. Rather than focusing on narrow performance metrics, the ARC Prize assesses what it calls "cognitive sufficiency" - a model's ability to generate meaningful insights and tackle open-ended challenges. This new way to think about LLM evaluation emphasizes creative thinking, sophisticated reasoning, and the capacity to make genuinely useful contributions to human knowledge. Arguably, it is an attempt to define and measure a step towards what it means to achieve AGI (Artificial General Intelligence). Defining AGI according to ARC Prize: @@ -956,19 +1011,950 @@ align: center Sample ARC-AGI Task. ``` - These features make the ARC benchmark a unique test of machine intelligence, focusing on the ability to adapt to novelty and solve problems without relying heavily on memorization. This is more aligned with the concept of general intelligence, which emphasizes the ability to learn efficiently and tackle new challenges. +The ARC-AGI benchmark remained unbeaten for five years as of December 2024 (a minimum score of 85% in the private dataset is required to win) {cite}`arcprizeresults2024`. A key takeaway is that algorithmic improvements, rather than massive computational resources, may be key to exceeding the target score for the ARC-AGI benchmark. + + +In addition to the benchmarks discussed above, a growing set of domain-specific benchmarks is emerging to help evaluate LLMs in specific verticals, including: + - FinBench {cite}`zhang2024finbench`: Evaluates LLMs in the financial domain, covering tasks such as terminology understanding, temporal reasoning, future forecasting, scenario planning, and numerical modelling. + - LegalBench {cite}`guha2023legalbench` : Assesses the legal reasoning abilities of LLMs through tasks crowdsourced by legal professionals + - Berkeley Function Leaderboard (BFCL) {cite}`patil2023gorilla`: Evaluates LLMs' function-calling abilities + + As language models continue to advance in capability and complexity, evaluation frameworks must evolve. Modern benchmarks increasingly incorporate tests for nuanced reasoning, ethical decision-making, and emergent capabilities that weren't previously measurable. This ongoing evolution reflects a deeper understanding that the true value of language models lies not in achieving high scores on standardized tests with narrow task-specific metrics, but in their ability to meaningfully contribute to human understanding and help solve real-world problems while demonstrating the ability to learn and adapt to new tasks. +In the following sections, we will explore some open source tools developers can use to automate and streamline the challenging task of LLMs evals. + ## Tools ### LightEval -LightEval {cite}`lighteval` is a lightweight framework for LLM evaluation that allows for efficient evaluation of LLMs across a variety of tasks. It is designed to be used in conjunction with the Hugging Face ecosystem and can be used to evaluate the performance of LLMs on a variety of tasks. +LightEval {cite}`lighteval` is a lightweight framework for evaluation of LLMs across a variety of standard and bespoke metrics and tasks across multiple inference backends via Python SDK and CLI. + +As a motivating example, consider a scenario where financial data has been extracted from SEC financial filings and require econometric analysis. Tasks like estimating autoregressive models for time series forecasting or conducting hypothesis tests on market efficiency are common in financial analysis. Let's evaluate how well different models perform on this type of task. + +First, we need to select a benchmark to assess LLMs capabilities in this domain. MMLU has a sub-benchmark called Econometrics we can use for this task. {numref}`mmlu-econometrics` shows a sample of the benchmark dataset from MMLU Econometrics. It consists of multiple-choice questions from econometrics and expected answers. + +```{table} MMLU Econometrics Task Dataset sample +:name: mmlu-econometrics +| Question | Options | Correct Options | Correct Options Index | Correct Options Literal | +|-----------|----------|-----------------|---------------------|----------------------| +| Consider the following AR(1) model with the disturbances having zero mean and unit variance: yt = 0.2 + 0.4 yt-1 + ut The (unconditional) mean of y will be given by | ["0.2", "0.4", "0.5", "0.33"] | ["b"] | [3] | ["0.33"] | +| Suppose that a test statistic has associated with it a p-value of 0.08. Which one of the following statements is true? (i) If the size of the test were exactly 8%, we... | ["(ii) and (iv) only", "(i) and (iii) only", "(i), (ii), and (iii) only", "(i), (ii), (iii), and (iv)"] | ["c"] | [2] | ["(i), (ii), and (iii) only"] | +| What would be then consequences for the OLS estimator if heteroscedasticity is present in a regression model but ignored? | ["It will be biased", "It will be inconsistent", "It will be inefficient", "All of (a), (b) and (c) will be true."] | ["c"] | [2] | ["It will be inefficient"] | +| Suppose now that a researcher wishes to use information criteria to determine the optimal lag length for a VAR. 500 observations are available for the bivariate VAR... | ["1 lag", "2 lags", "3 lags", "4 lags"] | ["c"] | [2] | ["3 lags"] | +``` + +The code sample below demonstrates the LightEval Python SDK framework for evaluating a target LLM model on a given task. First, we instantiate an `EvaluationTracker` which manages result storage, in this example kept in a local directory `output_dir`, and tracks detailed evaluation metrics, optionally pushed to HuggingFace Hub. + +Next, we instantiate an object of the class `PipelineParameters` which, in this example, configures the pipeline for parallel processing with a temporary cache in `cache_dir` also setting the maximum number of samples to process to `max_samples`. Then, in `BaseModelConfig` we set up the LLM model we would like to evaluate defined in `pretrained`. + +```bash +pip install lighteval[accelerate] +``` + +```python +import lighteval +from lighteval.logging.evaluation_tracker import EvaluationTracker +from lighteval.models.model_config import BaseModelConfig +from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters +from lighteval.utils.utils import EnvConfig +from lighteval.utils.imports import is_accelerate_available +from datetime import timedelta +from accelerate import Accelerator, InitProcessGroupKwargs + + +def create_evaluation_pipeline(output_dir: str, cache_dir: str, pretrained: str, dtype: str = "float16", max_samples: int = 10, task: str): + if is_accelerate_available(): + from accelerate import Accelerator, InitProcessGroupKwargs + accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))]) + else: + accelerator = None + + evaluation_tracker = EvaluationTracker( + output_dir=output_dir, + save_details=True, + push_to_hub=False + ) + + pipeline_params = PipelineParameters( + launcher_type=ParallelismManager.ACCELERATE, + env_config=EnvConfig(cache_dir=cache_dir), + override_batch_size=1, + max_samples=max_samples + ) + + model_config = BaseModelConfig( + pretrained=pretrained, + dtype=dtype, + use_chat_template=True, + trust_remote_code=True + ) + + pipeline = Pipeline( + tasks=task, + pipeline_parameters=pipeline_params, + evaluation_tracker=evaluation_tracker, + model_config=model_config + ) + + return pipeline +``` +{numref}`lighteval` shows a schematic representation of its key components. As inference engine, we leverage `accelerate` for distributed evaluation. `lighteval` also supports other inference backends such as `vllm` and `tgi`. + + + +```{figure} ../_static/evals/lighteval.png +--- +name: lighteval +alt: LightEval Python SDK Sample Conceptual Overview. +scale: 35% +align: center +--- +LightEval Python SDK Sample Conceptual Overview. +``` + +This setup allows for systematic evaluation of language model performance on specific tasks while handling distributed computation and result tracking. + +The final Pipeline combines these components to evaluate in the user defined `task`, which follows the following format: + +```bash +{suite}|{task}|{num_few_shot}|{0 or 1 to automatically reduce `num_few_shot` if prompt is too long} +``` + +The task string format follows a specific pattern with four components separated by vertical bars (|): + +1. suite: The evaluation suite name (e.g., "leaderboard") +2. task: The specific task name (e.g., "mmlu:econometrics") +3. num_few_shot: The number of few-shot examples to use (e.g., "0" for zero-shot) +4. A binary flag (0 or 1) that controls whether to automatically reduce the number of few-shot examples if the prompt becomes too long + +LightEval provides a comprehensive set of evaluation tasks {cite}`lighteval_tasks` and metrics {cite}`lighteval_metrics`. The available tasks span multiple categories and benchmarks including BigBench, MMLU, TruthfulQA, WinoGrande, and HellaSwag. The framework also supports standard NLP evaluation metrics including BLEU, ROUGE, Exact Match, F1 Score, and Accuracy. + +In our case, we choose to evaluate our LLMs on the MMLU econometrics task using zero-shot learning. Hence, we define the `task` as follows: + +```python +task = "leaderboard|mmlu:econometrics|0|0" +``` + +Example usage to evaluate an LLM, for instance `meta-llama/Llama-3.2-1B-Instruct`, on the MMLU econometrics task using zero-shot learning: + +```python +task = "leaderboard|mmlu:econometrics|0|0" +model = "meta-llama/Llama-3.2-1B-Instruct" +pipeline = create_evaluation_pipeline(output_dir="./evals/", cache_dir="./cache/", pretrained=model, task=task) +``` + +We can then evaluate the pipeline, save and show its results as follows: + +```python +pipeline.evaluate() +pipeline.save_and_push_results() +pipeline.show_results() +``` + +The results are then stored in `output_dir` in JSON format. + +The same results can be obtained by using the LightEval CLI: + +```bash +lighteval accelerate --model_args "pretrained=meta-llama/Llama-3.2-1B-Instruct" --tasks "leaderboard|mmlu:econometrics|0|0" --override_batch_size 1 --output_dir="./evals/" +``` + +We would like to compare the performance of multiple open source models on the MMLU econometrics task. While we could download and evaluate each model locally, we prefer instead to evaluate them on a remote server to save time and resources. LightEval enables serving the model on a TGI-compatible server/container and then running the evaluation by sending requests to the server {cite}`lighteval_server`. + +For that purpose, we can leverage HuggingFace Serverless Inference API [^lightevalbug] and set a configuration file for LightEval as shown below, where `` is the model identifier on HuggingFace (e.g. `meta-llama/Llama-3.2-1B-Instruct`) and `` is the user's HuggingFace API token. Alternatively, you could also pass an URL of a corresponding dedicated inference API if you have one. +[^lightevalbug]: We found a bug in LightEval that prevented it from working with the HuggingFace Serverless Inference API: https://github.com/huggingface/lighteval/issues/422. Thanks to the great work of the LightEval team, this issue has been fixed. +``` +model: + type: "tgi" + instance: + inference_server_address: "https://api-inference.huggingface.co/models/" + inference_server_auth: "" + model_id: null +``` + +Now we can run the evaluation by sending requests to the server as follows by using the same bash command as before but now setting the `model_config_path` to the path of the configuration file we have just created (e.g. `endpoint_model.yaml`): + +```bash +lighteval accelerate --model_config_path="endpoint_model.yaml" --tasks "leaderboard|mmlu:econometrics|0|0" --override_batch_size 1 --output_dir="./evals/" +``` + +To complete our task, we evaluate a few models from the following model families: `Llama3.2`, `Qwen2.5`, and `SmolLM2` as described in {numref}`model-families`. + +```{table} Model Families Evaluated Using LightEval +:name: model-families +| Model Family | Description | Models | References | +|--------------|-------------|---------|------------| +| Llama3.2 Instruct | LLaMA architecture-based pretrained and instruction-tuned generative models | `Llama-3.2-1B-Instruct`
    `Llama-3.2-3B-Instruct` | {cite}`meta_llama_models` | +| Qwen2.5 Instruct | Instruction-tuned LLMs family built by Alibaba Cloud | `Qwen2.5-0.5B-Instruct`
    `Qwen2.5-1.5B-Instruct`
    `Qwen2.5-3B-Instruct` | {cite}`gpt2docs,hui2024qwen2,qwen2` | +| SmolLM2 Instruct | Instruction-tuned family of compact language models built by HuggingFace | `SmolLM2-360M-Instruct`
    `SmolLM2-1.7B-Instruct` | {cite}`allal2024SmolLM2` | +``` + +We can then compare the performance of these models on the MMLU econometrics task as shown in {numref}`model-comparison`. + +```{figure} ../_static/evals/model-comparison.png +--- +name: model-comparison +alt: Model Comparison on MMLU Econometrics Task +scale: 50% +align: center +--- +Model performance comparison on MMLU Econometrics task, showing accuracy scores across different model sizes and architectures. +``` + +The results reveal several interesting patterns in model performance. As expected, we observe a trend where larger models consistently achieve higher accuracy scores. The evaluation shows distinct clusters among model families, with Qwen2.5, Llama-3.2, and SmolLM2 each exhibiting their own scaling characteristics, suggesting that architectural differences lead to varying degrees of efficiency as model size increases. Particularly noteworthy is the performance of the Qwen2.5 family, which demonstrates superior accuracy even at smaller model sizes when compared to Llama-3.2. + +Of course, the results should be taken with a grain of salt given the limited size of the dataset (MMLU Econometrics ~ 100), limited number of models and sizes. However, it gives a good indication of the capabilities of the different models tested with Qwen2.5 family being an interesting first candidate as a relatively small yet powerful model demonstrating a good trade-off between performance and size. Once tested on real-world data, the results will change but these initial findings are a good data-driven starting point for model selection as you begin your LLM-based application development. + +In summary, LightEval is a simple yet flexible and comprehensive framework for evaluating LLMs across a wide variety of tasks and metrics. It can serve as a first step in selecting your next LLM for a specific task given the exponential growth in number of (open source) models available {cite}`hf_num_models`. Its integration with the Hugging Face ecosystem and modular architecture make it particularly powerful for evaluating open source models. For further details, visit the [official repository](https://github.com/huggingface/lighteval) {cite}`lighteval`. + +### LangSmith + + + +Let's revisit our evaluation example when we were interested in evaluating the quality of summaries generated by different (smaller and cheaper) LLM models compared to a benchmark model (larger and more expensive). Recal the setup: + +- Benchmark model: gpt-4o + +- Test models: gpt-4o-mini, gpt-4-turbo, gpt-3.5-turbo + + +We can run evaluation using only langsmith without the need of langchain. + +```bash +!pip uninstall langchain +!pip uninstall langchain-community +!pip uninstall langchain-openai +!pip install langsmith +``` + +We need to generate an API key to use LangSmith. See instructions [here](https://docs.smith.langchain.com/). Remember to export your API_KEY. Activating tracing will allow us to track logs and foster observability of our evaluation. + +```bash +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_API_KEY= +``` + + +```python +import evaluate as hf_evaluate # HuggingFace's evaluate +from langsmith import evaluate as langsmith_evaluate # LangSmith's evaluate +from langsmith import Client +from typing import Dict, Any + +ls_client = Client() +``` + +The code below creates a dataset in LangSmith that will serve as our golden dataset for evaluation. The dataset consists of test cases where we create a single example with the following content: + +- An input: Our SEC filing document +- An expected output: A golden summary generated by our benchmark model (`gpt-4o`) + +This dataset will allow us to evaluate how well other models perform compared to our benchmark by comparing their generated summaries against these reference summaries. In practice, it's recommended to create a larger dataset with more diverse examples to get a more accurate assessment of model capabilities as well as to estimate confidence intervals for target metrics. + + + +```python +# Define dataset: these are your test cases +dataset_name = "Golden SEC Summary Dataset" +dataset = ls_client.create_dataset(dataset_name) +ls_client.create_examples( + inputs=[ + {"sec_filing": sec_filing}, + ], + outputs=[ + {"summary": benchmark_summary}, + ], + dataset_id=dataset.id, +) +``` + +Our Dataset is now available in LangSmith as shown in {numref}`langsmith_dataset`. + +```{figure} ../_static/evals/langsmith_dataset.png +--- +name: langsmith_dataset +alt: LangSmith Dataset +scale: 25% +align: center +--- +LangSmith Dataset +``` + +Next, we write our evaluator. This evaluator calculates BLEU scores between generated and reference summaries using HuggingFace's evaluate package. The evaluator takes two dictionaries as input - one containing the generated summary and another containing the reference summary. It returns a dictionary with the Google BLEU score, which measures the overlap between n-grams in the generated and reference texts similar to our previous metric-based experiments. + + +```python +def calculate_scores(outputs: Dict[str, Any], reference_outputs: Dict[str, Any]) -> dict: + """ + Custom evaluator that calculates BLEU and ROUGE scores between generated and reference summaries + using HuggingFace's evaluate package + + Args: + outputs (dict): Contains the generated summary + reference_outputs (dict): Contains the reference summary + + Returns: + dict: Dictionary containing Google BLEU score + """ + generated = outputs.get("summary", "") + reference = reference_outputs.get("summary", "") + + # Initialize metrics from HuggingFace's evaluate + bleu = hf_evaluate.load("google_bleu") + + # Format inputs for BLEU (expects list of str for predictions and list of list of str for references) + predictions = [generated] + references = [reference] + + # Compute BLEU score + bleu_score = bleu.compute(predictions=predictions, references=[references]) + + return {"key": "google_bleu", "score": bleu_score["google_bleu"]} +``` + +Now that we have defined our evaluation metrics, let's create a function to generate summaries for our smaller models. The function below takes a dictionary containing the SEC filing text as input and returns a dictionary with the generated summary. The prompt instructs the model to act as an expert analyst and generate a one-line summary of the filing excerpt. We use the same task and model configuration as in our previous experiments to maintain consistency in our evaluation pipeline. + + + +```python +from openai import OpenAI +oai_client = OpenAI() +``` + + +```python +TASK = "Generate a 1-liner summary of the following excerpt from an SEC filing." + +PROMPT = f""" +ROLE: You are an expert analyst tasked with summarizing SEC filings. +TASK: {TASK} +""" + +xp_model_name = "" # model to be tested + +def generate_summary(inputs: dict): + """ + Generate a summary of input using a given model + """ + TASK = "Generate a 1-liner summary of the following excerpt from an SEC filing." + + response = oai_client.chat.completions.create( + model=xp_model_name, # model_name is a global variable + messages=[{"role": "system", "content": PROMPT}, + {"role": "user", "content": inputs.get("sec_filing")}] + ) + return {"summary": response.choices[0].message.content} +``` + + +Lastly we define a function to run our evaluation. The `run_evaluation()` function uses LangSmith's `evaluate()` to run evaluations either locally or remotely. When running locally, results are not uploaded to LangSmith's servers. The function takes an application, dataset, and list of evaluators as input and returns the evaluation results. The application is the `generate_summary()` function we would like to evaluate. The `dataset` is the golden summary from the strong model. And we pass a list with our single evaluator `calculate_scores()`. LangSmith also allows for running multiple repetitions of the same experiment to get a more accurate assessment of model capabilities as well as to estimate confidence intervals for target metrics, which we set to 5 repetitions. + +This allows us to systematically assess our LLM-based application while maintaining control over where results are stored. +```python +def run_evaluation(app, model_name, dataset, evaluators, upload_results=False): + global xp_model_name + xp_model_name = model_name + results = langsmith_evaluate( + app, + client=None, + data=dataset, + evaluators=evaluators, + experiment_prefix=model_name, + num_repetitions=5, + upload_results= upload_results, # This is the key parameter for local evaluation + + ) + + return results +``` + +Now we are ready run evaluation on our app across all target LLM models. + + +```python +app = generate_summary +``` + + +```python +models = ["gpt-3.5-turbo", "gpt-4-turbo", "gpt-4o-mini"] +results = [run_evaluation(app, model, dataset=dataset_name, evaluators=[calculate_scores], upload_results=True) for model in models] + +``` + + View the evaluation results for experiment: 'gpt-3.5-turbo-386a3620' at: + https://smith.langchain.com/o/9e1cc3cb-9d6a-4356-ab34-138e0abe8be4/datasets/8741976e-5268-4b75-949f-99477dde5d64/compare?selectedSessions=b831dc1e-90bc-4ed8-8080-fb42444724d6 + + + + + 4it [00:10, 2.59s/it]Using the latest cached version of the module from /home/tobias/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--google_bleu/6fc70b7be0088120a372dfdd5d320b39b8bb3630cb8029b193941d9376e86bb0 (last modified on Tue Nov 26 16:50:45 2024) since it couldn't be found locally at evaluate-metric--google_bleu, or remotely on the Hugging Face Hub. + 5it [00:15, 3.09s/it] + + + View the evaluation results for experiment: 'gpt-4-turbo-5053784e' at: + https://smith.langchain.com/o/9e1cc3cb-9d6a-4356-ab34-138e0abe8be4/datasets/8741976e-5268-4b75-949f-99477dde5d64/compare?selectedSessions=64445871-a53c-44b1-a422-4f49b2f9656f + + + + + 5it [00:13, 2.69s/it] + + + View the evaluation results for experiment: 'gpt-4o-mini-4b29f3c9' at: + https://smith.langchain.com/o/9e1cc3cb-9d6a-4356-ab34-138e0abe8be4/datasets/8741976e-5268-4b75-949f-99477dde5d64/compare?selectedSessions=9ef7e39a-2add-410c-89f8-9f1a8b198cf1 + + + + + 5it [00:13, 2.61s/it] + + +We can obtain the results for all experiments including the execution time and the Google BLEU score. + + +```python +import pandas as pd +``` + + +```python +# Create list of dataframes from results +dfs = [result.to_pandas() for result in results] + +for df, model in zip(dfs, models): + df.insert(0, 'model', model) + +combined_df = pd.concat(dfs, ignore_index=True) +combined_df.head() +``` + + + + +
    + +

    Table 4.1 Structured Output Frameworks Comparison
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    modelinputs.sec_filingoutputs.summaryerrorreference.summaryfeedback.google_bleuexecution_timeexample_idid
    0gpt-3.5-turboUNITED STATES\nSECURITIES AND EXCHANGE COMMISS...Apple Inc.'s Form 10-K for the fiscal year end...NoneApple Inc.'s 10-K filing for the fiscal year e...0.3333331.224388feb10f92-3167-41f3-bb1c-d271153a31a85b196b22-9f4c-489c-b020-7823208b42d6
    1gpt-3.5-turboUNITED STATES\nSECURITIES AND EXCHANGE COMMISS...Apple Inc. filed its Form 10-K Annual Report f...NoneApple Inc.'s 10-K filing for the fiscal year e...0.3481010.722464feb10f92-3167-41f3-bb1c-d271153a31a8c310f159-064a-4035-97c3-a25bbf43abc2
    2gpt-3.5-turboUNITED STATES\nSECURITIES AND EXCHANGE COMMISS...Apple Inc. filed its annual Form 10-K for the ...NoneApple Inc.'s 10-K filing for the fiscal year e...0.3860760.704104feb10f92-3167-41f3-bb1c-d271153a31a8f7f24899-dd50-409e-93cc-6fb1622b60bf
    3gpt-3.5-turboUNITED STATES\nSECURITIES AND EXCHANGE COMMISS...Apple Inc. filed its Annual Report on Form 10-...NoneApple Inc.'s 10-K filing for the fiscal year e...0.4430380.725059feb10f92-3167-41f3-bb1c-d271153a31a8242856d6-efb5-4101-b1cf-5805532838ac
    4gpt-3.5-turboUNITED STATES\nSECURITIES AND EXCHANGE COMMISS...Apple Inc. filed its Annual Report on Form 10-...NoneApple Inc.'s 10-K filing for the fiscal year e...0.3734180.795302feb10f92-3167-41f3-bb1c-d271153a31a8ce975169-a0ab-40ce-8e32-efa28d06079d
    +
    + + + + +```python +# Calculate statistics per model +stats = combined_df.groupby('model').agg({ + 'feedback.google_bleu': ['mean', 'std'], + 'execution_time': ['mean', 'std'] +}).round(4) + +# Sort by execution time +stats = stats.sort_values(('execution_time', 'mean')) + +# Create a figure with two subplots side by side +import matplotlib.pyplot as plt +import numpy as np + +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5)) + +# Define colors for each model +colors = ['#2ecc71', '#3498db', '#e74c3c'] +models = stats.index + +# Plot for Google BLEU scores +bleu_means = stats[('feedback.google_bleu', 'mean')] +bleu_stds = stats[('feedback.google_bleu', 'std')] + +for i, model in enumerate(models): + ax1.errorbar(i, bleu_means[model], yerr=bleu_stds[model], + fmt='o', color=colors[i], markersize=8, capsize=5, + label=model) + if i > 0: + ax1.plot([i-1, i], [bleu_means[models[i-1]], bleu_means[model]], + '-', color=colors[i], alpha=0.5) + +ax1.set_ylabel('Google BLEU Score') +ax1.set_title('Google BLEU Scores by Model') +ax1.set_xticks(range(len(models))) +ax1.set_xticklabels(models, rotation=45) +ax1.set_ylim(bottom=0) # Set y-axis to start at 0 +ax1.legend() +ax1.grid(True, alpha=0.3) + +# Plot for execution times +exec_means = stats[('execution_time', 'mean')] +exec_stds = stats[('execution_time', 'std')] + +for i, model in enumerate(models): + ax2.errorbar(i, exec_means[model], yerr=exec_stds[model], + fmt='o', color=colors[i], markersize=8, capsize=5, + label=model) + if i > 0: + ax2.plot([i-1, i], [exec_means[models[i-1]], exec_means[model]], + '-', color=colors[i], alpha=0.5) + +ax2.set_ylabel('Execution Time (seconds)') +ax2.set_title('Execution Times by Model') +ax2.set_xticks(range(len(models))) +ax2.set_xticklabels(models, rotation=45) +ax2.set_ylim(bottom=0) # Set y-axis to start at 0 +ax2.legend() +ax2.grid(True, alpha=0.3) + +plt.tight_layout() +plt.show() + +# Display the statistics table +print("\nDetailed Statistics:") +print(stats) + +``` + + + +![png](evals_files/evals_74_0.png) + + + + + Detailed Statistics: + feedback.google_bleu execution_time + mean std mean std + model + gpt-4o-mini 0.4038 0.0453 0.7815 0.0433 + gpt-3.5-turbo 0.3768 0.0424 0.8343 0.2208 + gpt-4-turbo 0.3519 0.0775 0.9122 0.1482 + + +The evaluation results show interesting differences between the models: + +- GPT-3.5-turbo achieved a Google BLEU score of 0.377 (±0.042) with average execution time of 0.83s (±0.22s) +- GPT-4-turbo scored slightly lower at 0.352 (±0.078) and was slower at 0.91s (±0.15s) +- GPT-4o-mini performed best with a BLEU score of 0.404 (±0.045) while being fastest at 0.78s (±0.04s) + +As expected, results suggest that the newer GPT-4o-mini model achieves better quality while maintaining lower latency compared to both GPT-3.5 and GPT-4 turbo variants. The standard deviations indicate that GPT-4-turbo has the most variable output quality, while GPT-4o-mini is most consistent in both quality and speed. Interestingly, the more advanced gpt-4-turbo model has lower BLEU scores but takes longer to execute. This suggests that model size and computational complexity don't necessarily correlate with better performance on this specific summarization task. Of course, this is a very simple task further increasing the number of experiment iterations will yield more accurate results. + + +Since we decided to upload result, we can also visualize the experiment results in LangSmith as shown in {numref}`langsmith`. + +```{figure} ../_static/evals/langsmith.png +--- +name: langsmith +alt: LangSmith Experiment Results +scale: 25% +align: center +--- +LangSmith Experiment Results +``` + +### PromptFoo + +Promptfoo {cite}`promptfoo2024` is an open-source framework designed for evaluating applications that utilize LLMs. Key features include: + +1. **Automated Testing**: Promptfoo provides automated testing capabilities, allowing developers to run custom evaluations tailored to their applications. + +2. **Custom Probes**: Developers can create custom probes to focus on specific use cases for instance decoupling prompts from tests cases. + +3. **User-Friendly CLI**: The framework features a command-line interface that supports live reloads and caching, facilitating rapid testing and iteration. + +We will use promptfoo's command line interface in the following examples. Please follow installation instructions [here](https://www.promptfoo.dev/docs/installation/#for-command-line-usage). + + +Evals are defined in a configuration file `promptfooconfig.yaml`, which defines elements such as providers, prompts, test cases, and assertions. + +In the following example, we will perform a two-step evaluation: + +1. Evaluate the performance of different LLM models given a set of constraints. +2. Evaluate the quality of different prompts for the best performing model from 1. + + +```python +import yaml + +# Read the YAML file +with open('promptfoo/model_comparison/promptfooconfig.yaml', 'r') as f: + config = yaml.safe_load(f) + +# Pretty print the YAML content +print(yaml.dump(config, default_flow_style=False, sort_keys=False)) + +``` + + description: Best model eval + prompts: + - file://prompt1.txt + providers: + - openai:gpt-4o-mini + - openai:gpt-4 + - openai:gpt-3.5-turbo + defaultTest: + assert: + - type: cost + threshold: 0.001 + - type: latency + threshold: 1000 + - type: python + value: len(output) < 200 + - type: llm-rubric + value: Does the summary look like it was written by an expert analyst [Yes/No]? + tests: file://tests.csv + + + +The configuration file shows how PromptFoo can be used to evaluate different LLM models. The YAML configuration defines three providers (GPT-4o-mini, GPT-4, and GPT-3.5-turbo) and sets up test assertions to validate their outputs. These assertions check important constraints: + +1. Cost efficiency: Each inference must cost less than $0.001 +2. Latency requirements: Response time must be under 1000ms +3. Output length: Generated text must be less than 200 characters +4. Output quality: An LLM-based rubric evaluates if the output appears to be written by an expert (uses openai's gpt-4o model) + +The prompts are loaded from an external file (prompt1.txt) and test cases are defined in tests.csv. This structured approach enables systematic evaluation of model performance across multiple decoupled dimensions. + + +```bash +promptfoo eval --no-cache --output eval.json +``` + +This command will run the evaluation and store the results in eval.json while making sure that the evaluation is not cached so we are measuring actual latency of the LLMs. The code below processes the PromptFoo evaluation results stored in eval.json. It reads the evaluation data from the JSON file and extracts key metrics including: + +- Provider name (e.g. gpt-4, gpt-3.5-turbo) +- Latency in milliseconds +- Token usage statistics +- Cost per request +- Number of passed/failed assertions +- Prompt token count +- Total number of API requests + + +```python +import json +import pandas as pd + +# Read the JSON file +with open('promptfoo/model_comparison/eval.json', 'r') as f: + eval_data = json.load(f) + +# Extract results into a list of dictionaries +results = [] +for prompt in eval_data['results']['prompts']: + result = { + 'provider': prompt['provider'], + 'latency_ms': prompt['metrics']['totalLatencyMs'], + 'token_usage': prompt['metrics']['tokenUsage']['total'], + 'cost': prompt['metrics']['cost'], + 'assert_pass': prompt['metrics']['assertPassCount'], + 'assert_fail': prompt['metrics']['assertFailCount'], + 'prompt_tokens': prompt['metrics']['tokenUsage']['prompt'], + 'num_requests': prompt['metrics']['tokenUsage']['numRequests'] + } + results.append(result) + +``` + + +```python +from IPython.display import display, Markdown +``` + + +```python +# Convert to DataFrame +df = pd.DataFrame(results) +print(df) +``` + +| Provider | Latency (ms) | Token Usage | Cost | Assert Pass | Assert Fail | Prompt Tokens | Num Requests | +|----------|--------------|-------------|------|-------------|-------------|---------------|--------------| +| openai:gpt-4o-mini | 2463 | 97 | $0.000035 | 6 | 2 | 52 | 2 | +| openai:gpt-4 | 3773 | 103 | $0.004620 | 4 | 4 | 52 | 2 | +| openai:gpt-3.5-turbo | 1669 | 95 | $0.000091 | 7 | 1 | 52 | 2 | + + +The evaluation results reveal interesting performance characteristics across different OpenAI models. GPT-3.5-turbo demonstrates the best overall performance given our criteria with the lowest latency (1669ms), lowest token usage (95), and highest number of passed assertions (7). While GPT-4 shows higher token usage (103) and latency (3773ms), it also has the highest cost per request ($0.00462). The GPT-4-mini variant offers a middle ground, with moderate latency and token usage, while maintaining relatively good assertion performance (6 passes). These results suggest that for this particular evaluation task, GPT-3.5-turbo provides the best balance of performance, reliability, and cost-effectiveness. + +Promptfool also offers a web interface for visualizing the evaluation results as shown in {numref}`promptfoo1`. + +```bash +promptfoo view +``` + +We can observe results per test case (i.e. section of the SEC filing) and per provider. Humans can also manually review the results and provide feedback as well as generate new test cases. + +```{figure} ../_static/evals/promptfoo1.png +--- +name: promptfoo1 +alt: PromptFoo Evaluation Results +scale: 30% +align: center +--- +PromptFoo evaluation results showing performance metrics across different models. +``` + +Now that we have established `GPT-3.5-turbo` as our model of choice given the minimum required criteria based on cost, latency and basic qualitative evaluation, we can compare the performance of different prompts as a next evaluation step. Can we improve the quality of the summaries by using different prompts? + +First, we redefine our evaluation criteria. We now would like to select the prompt that delivers the most "detailed" summaries. Our updated promptfoo configuration file is shown below. + + +```python +# Read the YAML file +with open('promptfoo/prompt_comparison/promptfooconfig.yaml', 'r') as f: + config = yaml.safe_load(f) + +# Pretty print the YAML content +print(yaml.dump(config, default_flow_style=False, sort_keys=False)) +``` + + description: Best model eval + prompts: + - file://prompt1.txt + - file://prompt2.txt + - file://prompt3.txt + providers: + - openai:gpt-3.5-turbo + defaultTest: + assert: + - type: llm-rubric + value: 'Evaluate the output based on how detailed it is. Grade it on a scale + of 0.0 to 1.0, where: + + Score of 0.1: Not much detail. + + Score of 0.5: Some detail. + + Score of 1.0: Very detailed. + + ' + tests: file://tests.csv + + + +Note that we are now passing 3 different prompts. And we have updated our assertions to check if the output is "detailed" by leveraging promptfoo's `llm-rubric` assertion which will run an LLM-as-a-Judge for evaluation. Now, let's define 3 prompt variations we would like to test aiming at improving the quality/detail of the summaries. + + +```python +# Display the prompt variations +from IPython.display import display, Markdown +prompt_files = ['prompt1.txt', 'prompt2.txt', 'prompt3.txt'] +prompt_content = [] + +for file in prompt_files: + with open(f'promptfoo/prompt_comparison/{file}', 'r') as f: + content = f.read().strip() + prompt_content.append(f"### {file}\n---\n{content}\n") + +display(Markdown("\n\n".join(prompt_content))) + +``` + + +### prompt1.txt +--- +'Generate a 1-liner summary of the Section {{section}} from an SEC filing: {{content}}' + + +### prompt2.txt +--- +'ROLE: You are a financial analyst. TASK: Generate a 1-liner summary of the Section {{section}} from an SEC filing: {{content}}' + + +### prompt3.txt +--- +'ROLE: You are a financial analyst. REQUIREMENTS: BE DETAILED. TASK: Generate a 1-liner summary of the Section {{section}} from an SEC filing: {{content}}' + + + +The first prompt matches our previous prompt. The second prompt adds a "financial analyst" role to the prompt. The third prompt expands on second prompt and add a requirement "BE DETAILED". + +We can now run the evaluation again. + +```bash +promptfoo eval --output eval.json +``` + + +```python +# Read the evaluation results from JSON file +import json +with open('promptfoo/prompt_comparison/eval.json', 'r') as f: + eval_data = json.load(f) + +# Create a list to store the data +data = [] + +# Extract results for each test case +for result in eval_data['results']['results']: + section = result['vars']['section'] + prompt_id = result['promptId'] + score = result['gradingResult']['score'] if 'gradingResult' in result else 0.0 + + # Find the corresponding prompt file + for prompt in eval_data['results']['prompts']: + if prompt['id'] == prompt_id: + prompt_file = prompt['label'].split(':')[0] + break + + # Add to data list + data.append([section, prompt_file, score]) + +# Convert to DataFrame +df_raw = pd.DataFrame(data, columns=['Section', 'Prompt', 'Score']) + +# Pivot to get desired format +df = df_raw.pivot(index='Section', columns='Prompt', values='Score').reset_index() +df = df[['Section', 'prompt1.txt', 'prompt2.txt', 'prompt3.txt']] + +display(Markdown("### Prompt Comparison Results by Section")) +print(df) + +``` + + +### Prompt Comparison Results by Section + + + Prompt Section prompt1.txt prompt2.txt prompt3.txt + 0 Legal Proceedings 0.1 0.5 1.0 + 1 Risk Factors 0.1 0.5 0.5 + + +The results show that prompt3.txt performs best for Legal Proceedings sections, achieving a perfect score of 1.0 compared to 0.5 for prompt2.txt and 0.1 for prompt1.txt. For Risk Factors sections, both prompt2.txt and prompt3.txt achieve moderate scores of 0.5, while prompt1.txt scores poorly at 0.1. This suggests that prompt3.txt is generally more effective at extracting detailed information, particularly for legal content. In summary, defining a Role and a requirement for the output to be detailed is a good way to improve the quality of the summaries at least for this specific task, model and criteria. + + +In conclusion, Promptfoo can serve as an effective LLM application evaluation tool particularly for its ability to decouple several components of the evaluation process. Hence enabling the user to focus on the most important aspects of the evaluation given the particular application and criteria making it a valuable and flexible tool for LLM application development. + +### Comparison + +{numref}`tool-comparison` provides a summarized comparative analysis of three open source frameworks for language models evaluation we have discussed: Lighteval, LangSmith, and Promptfoo. Each framework is assessed based on key features such as integration capabilities, customization options, ease of use, and the ability to facilitate human and LLM collaboration. + +```{table} Comparison of Lighteval, LangSmith, and Promptfoo +:name: tool-comparison +| Feature/Aspect | Lighteval | LangSmith | Promptfoo | +|----------------------|------------------------------------|------------------------------------|------------------------------------| +| **Integration** | Seamless with Hugging Face models, easy access to multiple inference engines, and remote evaluation (e.g., TGI servers, HF serverless models) | User-provided models, evaluators, and metrics | CLI-based, user-provided models via YAML | +| **Customization** | Flexible task and metric support, quick evaluation against state-of-the-art leaderboards | Easy setup of custom tasks and metrics with plain vanilla Python functions, lacks predefined tasks and metrics | Default and user-provided probes, metrics, and assertions | +| **Ease of Use** | User-friendly, minimal setup | User-friendly, minimal setup, includes UI for result visualization | Simple CLI, rapid testing, includes UI for result visualization | +| **Human/LLM Collaboration** | Model-based evaluation | Model-based evaluation | Supports human and model evaluators | +``` + +## Conclusion + +Language models have fundamentally transformed how software is developed and evaluated. Unlike conventional systems that produce predictable outputs, LLMs generate varied, probabilistic responses that defy traditional testing approaches. While developers accustomed to deterministic systems may find this shift challenging, continuing to rely on legacy testing methods is unsustainable. These frameworks were not designed to handle the inherent variability of LLM outputs and will ultimately prove inadequate. + +Success requires embracing this new paradigm by implementing comprehensive evals that cover the non-deterministic generative nature of LLMs - this is the new Product Requirements Document (PRD) - and cultivating an organizational mindset focused on iteration, experimentation and growth. + +The shift from traditional software testing to LLM evaluation is not just a change in tools but a transformation in mindset. Those who recognize and adapt to this shift will lead the way in harnessing the power of LLMs in software development. + + +[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa] + +[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/ +[cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png +[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg + +``` +@misc{tharsistpsouza2024tamingllms, + author = {Tharsis T. P. Souza}, + title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software}, + year = {2024}, + chapter = {The Evals Gap}, + journal = {GitHub repository}, + url = {https://github.com/souzatharsis/tamingLLMs) +} +``` ## References ```{bibliography} :filter: docname in docnames diff --git a/tamingllms/markdown/frontiers.md b/tamingllms/markdown/frontiers.md index 8c26087..b8fa114 100644 --- a/tamingllms/markdown/frontiers.md +++ b/tamingllms/markdown/frontiers.md @@ -1,3 +1,7 @@ +we live in the sample, not the universe +-- Professor Christopher C. Geczy, Wharton + + Trends @@ -22,4 +26,6 @@ A New World https://learning.oreilly.com/library/view/designing-large-language/9781098150495/ch09.html#id275 +HBR: https://www.linkedin.com/posts/norbertgehrke_hbr-generative-ai-activity-7281253868238188544-k9MI?utm_source=share&utm_medium=member_desktop + diff --git a/tamingllms/markdown/intro.md b/tamingllms/markdown/intro.md index a3879a7..ab10fe5 100644 --- a/tamingllms/markdown/intro.md +++ b/tamingllms/markdown/intro.md @@ -35,11 +35,15 @@ Throughout this book, we'll tackle the following (non-exhaustive) list of critic 3. **Testing Complexity**: Traditional software testing methodologies break down when dealing with non-deterministic and generative systems, requiring new approaches. -4. **Safety and Alignment**: LLMs can generate harmful, biased, or inappropriate content, requiring robust safeguards and monitoring systems to ensure safe deployment. +4. **Safety**: LLMs can generate harmful, biased, or inappropriate content, requiring robust safeguards and monitoring systems to ensure safe deployment. -5. **Vendor Lock-in**: Cloud-based LLM providers can create significant dependencies and lock-in through their proprietary APIs and infrastructure, making it difficult to switch providers or self-host solutions. +5. **Alignment**: LLMs are next-token prediction models, which means they are not aligned with the user's preferences by default. -6. **Cost Optimization**: The computational and financial costs of operating LLM-based systems can quickly become prohibitive without careful management, and optimization. +6. **Vendor Lock-in**: Cloud-based LLM providers can create significant dependencies and lock-in through their proprietary APIs and infrastructure, making it difficult to switch providers or self-host solutions. + +7. **Cost Optimization**: The computational and financial costs of operating LLM-based systems can quickly become prohibitive without careful management, and optimization. + +We conclude with a discussion on the future of LLMs and the challenges that will arise as we move forward. ## A Practical Approach @@ -171,7 +175,7 @@ Now that your environment is set up, let's begin our exploration of LLM challeng ## About the Author -Tharsis Souza (Ph.D. Computer Science, UCL University of London) is a computer scientist and product leader specializing in AI-based products. He is a Lecturer at Columbia University's Master of Science program in Applied Analytics, (*incoming*) Head of Product, Equities at Citadel, and former Senior VP at Two Sigma Investments. He mentors under-represented students & working professionals to help create a more diverse global AI1 ecosystem. +Tharsis Souza (Ph.D. Computer Science, UCL University of London) is a computer scientist and product leader specializing in AI-based products. He is a Lecturer at Columbia University's Master of Science program in Applied Analytics, (*incoming*) Head of Product, Equities at Citadel, and former Senior VP at Two Sigma Investments. He mentors under-represented students & working professionals to help create a more diverse global AI ecosystem. With over 15 years of experience delivering technology products across startups and Fortune 500 companies, he is also an author of numerous scholarly publications and a frequent speaker at academic and business conferences. Grounded on academic background and drawing from practical experience building and scaling up products powered by language models at early-stage startups, major institutions as well as contributing to open source projects, he brings a unique perspective on bridging the gap between LLMs promised potential and their practical implementation challenges to enable the next generation of AI-powered products. diff --git a/tamingllms/markdown/toc.md b/tamingllms/markdown/toc.md index 6b39520..c343795 100644 --- a/tamingllms/markdown/toc.md +++ b/tamingllms/markdown/toc.md @@ -43,4 +43,14 @@ Abstract: *The current discourse around Large Language Models (LLMs) tends to fo [cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/ [cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png -[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg \ No newline at end of file +[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg + +``` +@misc{tharsistpsouza2024tamingllms, + author = {Tharsis T. P. Souza}, + title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software}, + year = {2024}, + journal = {GitHub repository}, + url = {https://github.com/souzatharsis/tamingLLMs) +} +``` \ No newline at end of file diff --git a/tamingllms/notebooks/cost.ipynb b/tamingllms/notebooks/cost.ipynb index 4cd6849..5a1bc87 100644 --- a/tamingllms/notebooks/cost.ipynb +++ b/tamingllms/notebooks/cost.ipynb @@ -315,7 +315,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Quantization is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by {cite}`unsloth2024llama3` [^unsloth]. The model's memory requirements vary significantly based on the quantization level used as demonstrated in {numref}`quantized`.\n", + "Quantization[^visual-quantization] is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by {cite}`unsloth2024llama3` [^unsloth]. The model's memory requirements vary significantly based on the quantization level used as demonstrated in {numref}`quantized`.\n", + "\n", + "[^visual-quantization]: Maarten Grootendorst provides the best visual guide for model quantization {cite}`grootendorst2024quantization`.\n", "\n", "[^unsloth]: Unsloth runs a business of making LLMs fine-tuning streamlined. Check them out at [unsloth.ai](https://unsloth.ai).\n", "\n", diff --git a/tamingllms/notebooks/evals.ipynb b/tamingllms/notebooks/evals.ipynb index 002eb4a..68b2390 100644 --- a/tamingllms/notebooks/evals.ipynb +++ b/tamingllms/notebooks/evals.ipynb @@ -853,7 +853,7 @@ "4. **Run Evaluations**: Use the judge model to score outputs. Consider using a large and/or more capable model as a judge to provide more nuanced assessments.\n", "5. **Aggregate and Analyze Results**: Interpret scores to refine applications.\n", "\n", - "```{figure} ../_static/evals/llm_judge.svg\n", + "```{figure} ../_static/evals/llm_judge.png\n", "---\n", "name: llm_judge\n", "alt: Conceptual Overview\n", @@ -1187,11 +1187,11 @@ "\n", "An alternative to the above approaches is to use humans to directly evaluate the LLM-judges themselves. A notable example of this is [Judge Arena](https://judgearena.com/) {cite}`judgearena2024`, which is a platform that allows users to vote on which AI model made the better evaluation. Under this approach, the performance of the LLM evaluator is given by the (blind) evaluation of humans who perform the voting on randomly generated pairs of LLM judges as depicted in {numref}`meta2`. Only after submitting a vote, users can see which models were actually doing the judging.\n", "\n", - "```{figure} ../_static/evals/meta2.svg\n", + "```{figure} ../_static/evals/meta2.png\n", "---\n", "name: meta2\n", "alt: Human-in-the-loop meta evaluation Conceptual Overview\n", - "scale: 60%\n", + "scale: 75%\n", "align: center\n", "---\n", "Human-in-the-loop Meta Evaluation.\n", diff --git a/tamingllms/notebooks/input.ipynb b/tamingllms/notebooks/input.ipynb index a8d6b4c..8397a78 100644 --- a/tamingllms/notebooks/input.ipynb +++ b/tamingllms/notebooks/input.ipynb @@ -12,11 +12,6 @@ "-- Steve Jobs\n", "```\n", "```{contents}\n", - "```\n", - "\n", - "\n", - "```{note}\n", - "This Chapter is Work-in-Progress.\n", "```" ] }, @@ -26,20 +21,22 @@ "source": [ "## Introduction\n", "\n", - "Large Language Models face several critical challenges in effectively processing input data. While advances in long-context language models (LCLMs) {cite}`lee2024longcontextlanguagemodelssubsume` have expanded the amount of information these systems can process simultaneously, significant challenges remain in managing and effectively utilizing extended inputs. \n", + "While advances in long-context language models (LCs) {cite}`lee2024longcontextlanguagemodelssubsume` have expanded the amount of information these systems can process, significant challenges remain in managing and effectively utilizing extended data inputs:\n", "\n", - "LLMs are sensitive to input formatting and structure, requiring careful data preparation to achieve optimal results {cite}`tan2024htmlraghtmlbetterplain`. They operate with knowledge cutoffs, providing potentially stale or outdated information that may not reflect current reality and demonstrate problems with temporal knowledge accuracy {cite}`amayuelas-etal-2024-knowledge`. LLMs also struggle with less common but important information showing a systematic loss of long-tail knowledge {cite}`kotha2024understanding`.\n", + "- LLMs are sensitive to input formatting and structure, requiring careful data preparation to achieve optimal results {cite}`he2024doespromptformattingimpact, liu2024enhancingllmscognitionstructurization, tan2024htmlraghtmlbetterplain`.\n", + "- They operate with knowledge cutoffs, providing potentially stale or outdated information that may not reflect current reality and demonstrate problems with temporal knowledge accuracy {cite}`amayuelas-etal-2024-knowledge`.\n", + "- LLMs also face \"lost-in-the-middle\" problems {cite}`wu2024longdocumentsummaryevaluation` and struggle with less common but important information showing a systematic loss of long-tail knowledge {cite}`kotha2024understanding`.\n", "\n", - "Motivated by these challenges, this chapter explores two key components:\n", + "Motivated by these challenges, this chapter explores two key input data components:\n", "\n", - "1. Data Parsing: Parsing documents into a unified format that is suitable for LLMs to process.\n", + "1. Data Parsing and Chunking: Parsing and chunking documents into a unified format that is suitable and more manageable for LLMs to process.\n", "2. Retrieval Augmentation: Augmenting LLMs with the ability to retrieve relevant, recent, and specialized information.\n", "\n", "In data parsing, we will explore some useful open source tools that help transform data into LLM-compatible formats, demonstrating their impact through a case study of structured information extraction from complex PDFs. In a second case study, we will introduce some chunking strategies to help LLMs process long inputs and implement a particular technique called Chunking with Contextual Linking the enables contextually relevant chunk processing.\n", "\n", - "In retrieval augmentation, we will explore how to enhance LLMs with semantic search capabilities for incorporating external context using RAGs (Retrieval Augmented Generation). Through a detailed case study, we build a RAG system for querying live codebases, illustrating methods to bridge static model knowledge with dynamic information requirements.\n", + "In retrieval augmentation, we will explore how to enhance LLMs with semantic search capabilities for incorporating external context using RAGs (Retrieval Augmented Generation) while discussing whether RAGs will be really needed in the future given the rise of long-context language models.\n", "\n", - "In our last case study, we build a quiz generator using a LLM with large context window. We will explore some additional relevant techniques such as prompt caching and response verification through citations.\n", + "While RAGs are useful for incorporating external context, they are not a silver bullet nor a mandatory component for all LLM applications. In our last case study, we leverage long-context windows to build a quiz generator from a large knowledge base. We will also explore some additional relevant techniques such as prompt caching and response verification through citations.\n", "\n", "By the chapter's conclusion, readers will possess relevant knowledge of input data management strategies for LLMs and practical expertise in selecting and implementing appropriate approaches and tools for specific use cases." ] @@ -50,9 +47,11 @@ "source": [ "## Parsing Documents\n", "\n", - "Building robust data ingestion and preprocessing pipelines is essential for any LLM application. This section explores tools and frameworks that streamline input data processing, in particular for parsing purposes, providing a unified interface for converting diverse data formats into standardized representations that LLMs can effectively process. By abstracting away format-specific complexities, they allow developers to focus on core application logic rather than parsing implementation details while maximizing the performance of the LLM.\n", + "Data parsing and formatting play a critical role in LLMs performance {cite}`he2024doespromptformattingimpact, liu2024enhancingllmscognitionstructurization, tan2024htmlraghtmlbetterplain`. Hence, building robust data ingestion and preprocessing pipelines is essential for any LLM application. \n", + "\n", + "This section explores open source tools that streamline input data processing, in particular for parsing purposes, providing a unified interface for converting diverse data formats into standardized representations that LLMs can effectively process. By abstracting away format-specific complexities, they allow developers to focus on core application logic rather than parsing implementation details while maximizing the LLM performance.\n", "\n", - "We will cover open source tools and frameworks that provide parsing capabilities for a wide range of data formats. And we will demonstrate how some of these tools can be used to extract structured information from complex PDFs also discussing how the quality of the parser can impact LLM's performance." + "We will cover open source tools that provide parsing capabilities for a wide range of data formats. And we will demonstrate how some of these tools can be used to extract structured information from complex PDFs demonstrating how the quality of the parser can impact LLM's performance." ] }, { @@ -61,7 +60,7 @@ "source": [ "### MarkItDown\n", "\n", - "MarkItDown is a Python package and CLI too developed by the Microsoft AutoGen team for converting various file formats to Markdown. It supports a wide range of formats including PDF, PowerPoint, Word, Excel, images (with OCR and EXIF metadata), audio (with transcription), HTML, and other text-based formats making it a useful tool for document indexing and LLM-based applications.\n", + "MarkItDown {cite}`microsoft2024markitdown` is a Python package and CLI tool developed by the Microsoft AutoGen team for converting various file formats to Markdown. It supports a wide range of formats including PDF, PowerPoint, Word, Excel, images (with OCR and EXIF metadata), audio (with transcription), HTML, and other text-based formats making it a useful tool for document indexing and LLM-based applications.\n", "\n", "Key features:\n", "- Simple command-line and Python API interfaces\n", @@ -81,7 +80,7 @@ "\n", "### Docling\n", "\n", - "Docling is a Python package developed by IBM Research for parsing and converting documents into various formats. It provides advanced document understanding capabilities with a focus on maintaining document structure and formatting.\n", + "Docling {cite}`docling2024github` is a Python package developed by IBM Research for parsing and converting documents into various formats. It provides advanced document understanding capabilities with a focus on maintaining document structure and formatting.\n", "\n", "Key features:\n", "- Support for multiple document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, etc.)\n", @@ -101,13 +100,6 @@ "```" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Frameworks-Based Parsing\n" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -119,17 +111,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "A common use case where document parsing matters is to structured data extraction from documents, particularly in the presence of complex formatting and layout. In this case study, we will extract the economic forecasts from Merrill Lynch's CIO Capital Market Outlook released on December 16, 2024 {cite:p}`merrill2024`. We will focus on page 7 of this document, which contains several economic variables organized in a mix of tables, text and images (see {numref}`forecast`)\n", + "A common use case where document parsing matters is structured data extraction, particularly in the presence of complex formatting and layout. In this case study, we will extract the economic forecasts from Merrill Lynch's CIO Capital Market Outlook released on December 16, 2024 {cite}`merrill2024`. We will focus on page 7 of this document, which contains several economic variables organized in a mix of tables, text and images (see {numref}`forecast`).\n", "\n", "\n", "```{figure} ../data/input/forecast.png\n", "---\n", "name: forecast\n", "alt: Forecast\n", - "scale: 50%\n", + "scale: 45%\n", "align: center\n", "---\n", - "Forecast\n", + "Merrill Lynch's CIO Capital Market Outlook released on December 16, 2024 {cite}`merrill2024`\n", "```" ] }, @@ -184,7 +176,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "How similar are the two results? We can use use Levenshtein distance to measure the similarity between the two results. We will also calculate a naive score using the `SequenceMatcher` from the `difflib` package, which is a simple measure of the similarity between two strings based on the number of matches in the longest common subsequence." + "How similar are the two results? We can use use Levenshtein distance to measure the similarity between the two results. We will also calculate a naive score using the `SequenceMatcher` from the `difflib` package, which is a simple measure of similarity between two strings based on the number of matches in the longest common subsequence." ] }, { @@ -256,7 +248,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It turns out that the two results are quite different, with a similarity score of about 13.98% and 17.77% for Levenshtein and `SequenceMatcher` respectively." + "It turns out that the two results are quite different, with a similarity score of about 13.98% and 17.77% for Levenshtein and `SequenceMatcher`, respectively." ] }, { @@ -351,7 +343,7 @@ "scale: 45%\n", "align: center\n", "---\n", - "Forecast 2025\n", + "Merrill Lynch's CIO Economic Forecasts.\n", "```\n", "\n", "We will define a `Forecast` pydantic model to represent an economic forecast composed of a `financial_variable` and a `financial_forecast`. We will also define a `EconForecast` pydantic model to represent the list of economic forecasts we want to extract from the document.\n" @@ -375,7 +367,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We write a simple function to extract the economic forecasts from the document using an LLM model (with structured output) with the following prompt template, where `extract_prompt` is kind of data the user would like to extract and `doc` is the input document to analyze." + "We write a simple function to extract the economic forecasts from the document using an LLM model (with structured output) with the following prompt template, where `extract_prompt` represents the kind of data the user would like to extract and `doc` is the input document to analyze." ] }, { @@ -682,7 +674,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, let's focus on the asset class weightings. We will extract the asset class weightings from the document and compare the results from MarkItDown and Docling. The information now is presented in a quite different structure. The CIO view information is represented in a spectrum from starting with \"Underweight\", passing through \"Neutral\" and reaching \"Overweight\". The actual view is marked by some colored dots in the chart. Let's see if we can extract this information from the document.\n", + "Now, let's focus on the asset class weightings. We will extract the asset class weightings from the document and compare the results from MarkItDown and Docling. The information now is presented in a quite different structure as we can see in {ref}`asset_class`. The CIO view information is represented in a spectrum starting with \"Underweight\", passing through \"Neutral\" and reaching \"Overweight\". The actual view is marked by some colored dots in the chart. Let's see if we can extract this relatively more complex information from the document.\n", "```{figure} ../_static/input/asset_class.png\n", "---\n", "name: asset_class\n", @@ -729,7 +721,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we construct a DataFrame to compare the results from MarkItDown and Docling with an added \"true_value\" column containing the true values from the document, which we extracted manually from the chart." + "We construct a DataFrame to compare the results from MarkItDown and Docling with an added \"true_value\" column containing the true values from the document, which we extracted manually from the chart. This enables us to calculate accuracy of the structured data extraction task in case." ] }, { @@ -936,7 +928,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Docling performs significantly better at 93.33% accuracy missing only one value. MarkItDown achieves 53.33% accuracy, struggling with nuanced asset class weightings. In this case, Docling's structured parsed output did help the LLM to extract the information more accurately compared to MarkItDown's unstructured output. Hence, in this case, the strategy used to parse the data did impact the LLM's ability to extract the information. A more robust analysis would run data extraction on a large sample data a number of repeated runs to estimate error rates." + "We observe that Docling performs significantly better at 93.33% accuracy missing only one value. MarkItDown achieves 53.33% accuracy struggling with nuanced asset class weightings. In this case, Docling's structured parsed output did help the LLM to extract the information more accurately compared to MarkItDown's unstructured output. Hence, in this case, the strategy used to parse the data did impact the LLM's ability to extract structured information. Having said that, it is important to mention that a more robust analysis would run data extraction on a large sample data a number of repeated runs to estimate error rates since results are non-deterministic." ] }, { @@ -945,8 +937,8 @@ "source": [ "What if we want to systematically extract all tables from the document? We can use Docling to do that by simply accessing the `tables` attribute of the `DocumentConverter` object.\n", "\n", - "By doing that, we observe that Docling extracted 7 tables from the document. Exporting tables from top down and left to right in order of appearance in the document.\n", - "Below, we can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts as well as the last table, which contains CIO Equity Sector Views.\n" + "By doing that, we observe that Docling extracted 7 tables from the document exporting tables from top down and left to right in order of appearance in the document.\n", + "Below, we display the first two and the last tables. We can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts as well as the last table, which contains CIO Equity Sector Views.\n" ] }, { @@ -1593,7 +1585,14 @@ "- The description mentions \"overweight positions in certain sectors such as Utilities and Financials\" but looking at the CIO Equity Sector Views, both these sectors show neutral positions, not overweight positions.\n", "- For fixed income, the description cites a \"10-Year (4.03%)\" yield, but the image shows the 30-Year Yield at 4.03%, while the 10-Year Yield is actually 4.40%.\n", "\n", - "Arguably, the description's inaccuracies could be a consequence of the underlying LLM model's inability to process the image. Further research is needed to determine if this is the case." + "Arguably, the description's inaccuracies could be a consequence of the underlying LLM model's inability to process the image.\n", + "\n", + "We have covered MarkitDown and Docling as examples of open source tools that can help developers parse input data into a suitable format to LLMs. Other relevant open source tools worth mentioning include:\n", + "- Unstructured.io {cite}`unstructured2024github`: A Python library for unstructured data extraction.\n", + "- FireCrawl {cite}`mendable2024firecrawl`: A Fast and Efficient Web Crawler for LLM Training Data.\n", + "- LlamaParse {cite}`llamaparse2024github`: Llamaindex's data parsing solution.\n", + "\n", + "The choice of tool depends on the specific requirements of the application and the nature of the input data. This choice should be taken as a critical decision of any data intensive LLM-based application and deserves dedicated research and evidence-based experimentation.\n" ] }, { @@ -1602,75 +1601,152 @@ "source": [ "## Retrieval-Augmented Generation\n", "\n", - "RAG is a technique that allows LLMs to retrieve information from a knowledge base to answer questions. It is a popular technique for building LLM applications that require knowledge-intensive tasks {cite}`lewis2021retrievalaugmentedgenerationknowledgeintensivenlp`.\n", + "What happens if we asked ChatGPT who's the author of the book \"Taming LLMs\"?\n", "\n", - "RAG utilizes a retrieval system to fetch external knowledge and augment the LLM. It has proved effective in mitigating hallucinations of LLMs {cite}`10.1145/3589334.3645481, ni-etal-2024-llms`." + "\n" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 1, "metadata": {}, + "outputs": [], "source": [ - "## Case Studies\n", - "\n", - "This section presents three case studies that demonstrate practical solutions to common LLM limitations:\n", - "\n", - "First, Content Chunking with Contextual Linking showcases how intelligent chunking strategies can overcome both context window and output token limitations. This case study illustrates techniques for breaking down and reassembling content while maintaining coherence, enabling the generation of high-quality long-form outputs despite model constraints.\n", + "from dotenv import load_dotenv\n", + "import os\n", "\n", - "Second, a Retrieval Augmented Generation case study addresses the challenge of stale or outdated model knowledge. By implementing semantic search over a GitHub repository, this example demonstrates how to augment LLM responses with current, accurate information - allowing users to query and receive up-to-date answers about code repository contents.\n", + "# Load environment variables from .env file\n", + "load_dotenv()\n", "\n", - "Third, the final case study builds a Quiz generator with citations. This case study explores some additional input management techniques that become particularly useful when long context window is available. This includes implementing prompt caching for efficiency and adding citations to enhance response accuracy and verifiability. These approaches show how to maximize the benefits of larger context models while maintaining response quality." + "from openai import OpenAI\n", + "client = OpenAI()\n", + "model = \"gpt-4o-mini\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "question = \"Who's the Author of the Book Taming LLMs?\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The book \"Taming LLMs\" is authored by *G. Arulkumaran, H. M. B. P. D. Karthikeyan, and I. A. M. Almasri.* If you need more information about the book or its contents, feel free to ask!\n" + ] + } + ], + "source": [ + "response = client.chat.completions.parse(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": question}\n", + " ]\n", + ")\n", + "response.choices[0].message.content" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Case Study I: Content Chunking with Contextual Linking\n", + "Turns out ChatGPT hallucinates. A quick web search on the before mentioned authors yields no results. In fact, those authors names are made up. And of course the correct answer would have been \"Tharsis Souza\".\n", "\n", - "Content chunking with contextual linking is a technique to break down long-form content into smaller, manageable chunks while keeping chunk-specific context. This approach tackles three problems:\n", - "1. The LLM's inability to process long inputs to do context-size limits\n", - "2. The LLM's inability to generate long-form content due to the `max_output_tokens` limitation.\n", - "3. The LLM's inability to maintain coherence and context when generating responses per chunks\n", + "LLMs only have access to the information they have been trained on, which of course has been fixed at a point in time. Hence, LLMs operate with stale data. The problem gets exacerbated by the fact that LLMs are trained to provide an answer even if the answer is unknown by them, hence leading to hallucinations. \n", "\n", - "Here, we exemplify this technique by following these steps:\n", - "1. **Chunking the Content**: The input content is split into smaller chunks. This allows the LLM to process each chunk individually, focusing on generating a complete and detailed response for that specific section of the input.\n", + "One solution to this problem is to use a retrieval system to fetch information from a knowledge base to provide recent and relevant context to user queries using so-called Retrieval Augmented Generation (RAG) system.\n", "\n", - "2. **Maintaining Context**: Each chunk is linked with contextual information from the previous chunks. This helps in maintaining the flow and coherence of the content across multiple chunks.\n", + "RAG utilizes a retrieval system to fetch external knowledge and augment LLM's context. It is a useful technique for building LLM applications that require domain-specific information or knowledge-intensive tasks {cite}`lewis2021retrievalaugmentedgenerationknowledgeintensivenlp`. It has also proved effective in mitigating LLMs hallucinations {cite}`10.1145/3589334.3645481, ni-etal-2024-llms`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the above example, a RAG would help with hallucinations by grounding the LLM's response to information provided in the knowledge base. Additional common use cases of RAG systems include:\n", "\n", - "3. **Generating Linked Prompts**: For each chunk, a prompt is generated that includes the chunk's content and its context. This prompt is then used to generate the output for that chunk.\n", + "1. **Enterprise Knowledge Management**: RAG enables organizations to synthesize answers from diverse internal data sources like documents, databases, and communication channels. This creates a unified knowledge interface that can accurately answer questions using the organization's own data.\n", + "2. **Document Processing and Analysis**: RAG excels at extracting and analyzing information from complex documents like financial reports, presentations, and spreadsheets. The system can enable LLMs to understand context and relationships across different document types and formats.\n", + "3. **Intelligent Customer Support**: By combining knowledge bases with conversational abilities, RAG powers chatbots and support systems that can maintain context across chat history, provide accurate responses, and handle complex customer queries while reducing hallucinations.\n", + "4. **Domain-Specific Applications**: RAG allows LLMs to be equipped with specialized knowledge in fields like medicine, law, or engineering by retrieving information from domain-specific literature, regulations, and technical documentation. This enables accurate responses aligned with professional standards and current best practices.\n", + "5. **Code Documentation and Technical Support**: RAG can help developers by retrieving relevant code examples, API documentation, and best practices from repositories and documentation, which often suffer updates frequently, enabling more accurate and contextual coding assistance.\n", "\n", - "4. **Combining the Outputs**: The outputs of all chunks are combined to form the final long-form content.\n", + "If LLMs alone work on stale, general-purpose data with the added challenge of being prone to hallucinations, RAG systems serve as an added capability enabling LLMs to work on recent, domain-specific knowledge increasing the likelihood of LLMs to provide responses that are factual and relevant to user queries.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### RAG Pipeline\n", "\n", - "Let's examine an example implementation of this technique.\n", + "RAG architectures vary but they all share the same goal: to retrieve relevant information from a knowledge base to maximize the LLM's ability to effectively and accurately respond to prompts, particularly when the answer requires out-of-training data information.\n", "\n", - "#### Generating long-form content\n", + "We will introduce key components of a RAG system one by one leading to a full canonical RAG pipeline at the end that ultimately will be used to answer our original question \"Who's the author of the book Taming LLMs?\", accurately.\n", "\n", - "- Goal: Generate a long-form report analyzing a company's financial statement.\n", - "- Input: A company's 10K SEC filing.\n", + "The following basic components will be introduced (see {numref}`rag_pipeline` for a visual representation):\n", + "- Vector Database\n", + " - Embeddings\n", + " - Indexing\n", + "- Retrieval System including re-ranking\n", + "- LLM Augmented Generation via in-context learning\n", "\n", - "```{figure} ../_static/structured_output/diagram1.png\n", + "Data extraction, parsing and chunking are also part of a canonical pipeline as we prepare the knowledge base. Those are concepts that we have already explored in the previous sections, hence we will be succinct here. We will start by preparing the knowledge base.\n", + "\n", + "```{figure} ../_static/input/rag.svg\n", "---\n", - "name: content-chunking-with-contextual-linking\n", - "alt: Content Chunking with Contextual Linking\n", - "scale: 50%\n", + "name: rag_pipeline\n", + "alt: RAG Pipeline\n", + "scale: 99%\n", "align: center\n", "---\n", - "Content Chunking with Contextual Linking Schematic Representation.\n", - "```\n", - "\n", - "The diagram in {numref}`content-chunking-with-contextual-linking` illustrates the process we will follow for handling long-form content generation with Large Language Models through \"Content Chunking with Contextual Linking.\" It shows how input content is first split into manageable chunks using a chunking function (e.g. `CharacterTextSplitter` with `tiktoken` tokenizer), then each chunk is processed sequentially while maintaining context from previous chunks. For each chunk, the system updates the context, generates a dynamic prompt with specific parameters, makes a call to the LLM chain, and stores the response. After all chunks are processed, the individual responses are combined with newlines to create the final report, effectively working around the token limit constraints of LLMs while maintaining coherence across the generated content.\n", + "Simplified RAG Pipeline\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Preparing the Knowledge Base\n", "\n", - "**Step 1: Chunking the Content**\n", + "Every RAG system requires a knowledge base. In our case, the knowledge base is a set of documents that we equip the LLM to answer our authorship question.\n", "\n", - "There are different methods for chunking, and each of them might be appropriate for different situations. However, we can broadly group chunking strategies in two types:\n", - "- **Fixed-size Chunking**: This is the most common and straightforward approach to chunking. We simply decide the number of tokens in our chunk and, optionally, whether there should be any overlap between them. In general, we will want to keep some overlap between chunks to make sure that the semantic context doesn’t get lost between chunks. Fixed-sized chunking may be a reasonable path in many common cases. Compared to other forms of chunking, fixed-sized chunking is computationally cheap and simple to use since it doesn’t require the use of any specialied techniques or libraries.\n", - "- **Content-aware Chunking**: These are a set of methods for taking advantage of the nature of the content we’re chunking and applying more sophisticated chunking to it. Examples include:\n", - " - **Sentence Splitting**: Many models are optimized for embedding sentence-level content. Naturally, we would use sentence chunking, and there are several approaches and tools available to do this, including naive splitting (e.g. splitting on periods), NLTK, and spaCy.\n", - " - **Recursive Chunking**: Recursive chunking divides the input text into smaller chunks in a hierarchical and iterative manner using a set of separators.\n", - " - **Semantic Chunking**: This is a class of methods that leverages embeddings to extract the semantic meaning present in your data, creating chunks that are made up of sentences that talk about the same theme or topic.\n", + "Hence, we will compose our knowledge base by adding the web version of (some of the chapters of) the book \"Taming LLMs\", namely:\n", + "- Introduction\n", + "- Structured Output\n", + "- Input (this very chapter)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "book_url = \"https://www.tamingllms.com/\"\n", + "chapters = [\"markdown/intro.html\",\n", + " \"notebooks/structured_output.html\",\n", + " \"notebooks/input.html\"]\n", "\n", - " Here, we will utilize `langchain` for a content-aware sentence-splitting strategy for chunking. Langchain offers several text splitters {cite}`langchain_text_splitters` such as JSON-, Markdown- and HTML-based or split by token. We will use the `CharacterTextSplitter` with `tiktoken` as our tokenizer to count the number of tokens per chunk which we can use to ensure that we do not surpass the input token limit of our model.\n" + "chapter_urls = [f\"{book_url}/{chapter}\" for chapter in chapters]\n", + "chapter_ids = [chapter.split(\"/\")[-1].replace(\".html\", \"\") for chapter in chapters]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use `Docling` to download the chapters from the web and parse them as markdown files." ] }, { @@ -1679,36 +1755,57 @@ "metadata": {}, "outputs": [], "source": [ - "def get_chunks(text: str, chunk_size: int, chunk_overlap: int) -> list:\n", - " \"\"\"\n", - " Split input text into chunks of specified size with specified overlap.\n", + "chapters = [converter.convert(chapter_url).document.export_to_markdown() for chapter_url in chapter_urls]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we are ready to store the chapters in a vector database to enable the construction of a retrieval system." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Vector Database\n", "\n", - " Args:\n", - " text (str): The input text to be chunked.\n", - " chunk_size (int): The maximum size of each chunk in tokens.\n", - " chunk_overlap (int): The number of tokens to overlap between chunks.\n", + "Vector databases are specialized databases designed to store and retrieve high-dimensional vectors, which are mathematical representations of data like text, images, or audio. These databases are optimized for similarity search operations, making them ideal for embeddings-based retrieval systems.\n", "\n", - " Returns:\n", - " list: A list of text chunks.\n", - " \"\"\"\n", - " from langchain_text_splitters import CharacterTextSplitter\n", + "A typical pipeline involving a vector database includes the following:\n", "\n", - " text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", - " return text_splitter.split_text(text)\n" + "1. Input data is converted into \"documents\" forming a collection representing our knowledge base\n", + "2. Each document is converted into an embedding which are stored in the vector database\n", + "3. Embeddings are indexed in the vector database for efficient similarity search\n", + "4. The vector database is queried to retrieve the most relevant documents\n", + "5. The retrieved documents are used to answer questions\n", + "\n", + "Vector databases are not a mandatory component of RAG systems. In fact, we can use a simple list of strings to store the chapters (or their chunks) and then use the LLM to answer questions about the document. However, vector databases are useful for RAG applications as they enable:\n", + "- Fast similarity search for finding relevant context\n", + "- Efficient storage of document embeddings\n", + "- Scalable retrieval for large document collections\n", + "- Flexible querying with metadata filters\n", + "\n", + "In that way, RAG applications can be seen as a retrieval system that uses a vector database to store and retrieve embeddings of documents, which in turn are used to augment LLMs with contextually relevant information as we will see in the next sections.\n", + "\n", + "Here, we will use ChromaDB {cite}`chromadb2024docs` as an example of an open source vector database but key features and concepts we cover are applicable to other vector databases, in general.\n", + "\n", + "ChromaDB is a popular open-source vector database that offers:\n", + "- Efficient storage and retrieval of embeddings\n", + "- Support for metadata and filtering\n", + "- Easy integration with Python applications\n", + "- In-memory and persistent storage options\n", + "- Support for multiple distance metrics\n", + "\n", + "Other notable vector databases include Weaviate, FAISS, and Milvus." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Step 2: Writing the Base Prompt Template**\n", - "\n", - "We will write a base prompt template which will serve as a foundational structure for all chunks, ensuring consistency in the instructions and context provided to the language model. The template includes the following parameters:\n", - "- `role`: Defines the role or persona the model should assume.\n", - "- `context`: Provides the background information or context for the task.\n", - "- `instruction`: Specifies the task or action the model needs to perform.\n", - "- `input_text`: Contains the actual text input that the model will process.\n", - "- `requirements`: Lists any specific requirements or constraints for the output." + "In ChromaDB, we can create a vector database client as follows." ] }, { @@ -1717,26 +1814,17 @@ "metadata": {}, "outputs": [], "source": [ - "from langchain_core.prompts import PromptTemplate\n", - "def get_base_prompt_template() -> str:\n", - " \n", - " base_prompt = \"\"\"\n", - " ROLE: {role}\n", - " CONTEXT: {context}\n", - " INSTRUCTION: {instruction}\n", - " INPUT: {input}\n", - " REQUIREMENTS: {requirements}\n", - " \"\"\"\n", - " \n", - " prompt = PromptTemplate.from_template(base_prompt)\n", - " return prompt" + "import chromadb\n", + "chroma_client = chromadb.Client()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We will write a simple function that returns an `LLMChain` which is a simple `langchain` construct that allows you to chain together a combination of prompt templates, language models and output parsers." + "This will create a vector database in memory. We can also create a persistent vector database by specifying a path to a directory or alternatively by using a cloud-based vector database service like AWS, Azure or GCP. We will use a vector database in memory for this example.\n", + "\n", + "Next, we create a collection to store the embeddings of the chapters. And add our chapters as documents to the collection as follows." ] }, { @@ -1745,45 +1833,19 @@ "metadata": {}, "outputs": [], "source": [ - "from langchain_core.output_parsers import StrOutputParser\n", - "from langchain_community.chat_models import ChatLiteLLM\n", + "collection = chroma_client.create_collection(name=\"taming_llms\")\n", "\n", - "def get_llm_chain(prompt_template: str, model_name: str, temperature: float = 0):\n", - " \"\"\"\n", - " Returns an LLMChain instance using langchain.\n", - "\n", - " Args:\n", - " prompt_template (str): The prompt template to use.\n", - " model_name (str): The name of the model to use.\n", - " temperature (float): The temperature setting for the model.\n", - "\n", - " Returns:\n", - " llm_chain: An instance of the LLMChain.\n", - " \"\"\"\n", - " \n", - " from dotenv import load_dotenv\n", - " import os\n", - "\n", - " # Load environment variables from .env file\n", - " load_dotenv()\n", - " \n", - " api_key_label = model_name.split(\"/\")[0].upper() + \"_API_KEY\"\n", - " llm = ChatLiteLLM(\n", - " model=model_name,\n", - " temperature=temperature,\n", - " api_key=os.environ[api_key_label],\n", - " )\n", - " llm_chain = prompt_template | llm | StrOutputParser()\n", - " return llm_chain" + "collection.add(\n", + " documents=chapters,\n", + " ids=chapter_ids\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Step 3: Constructing Dynamic Prompt Parameters**\n", - "\n", - "Now, we will write a function (`get_dynamic_prompt_template`) that constructs prompt parameters dynamically for each chunk." + "We are ready to query the collection. We write a simple function that takes the collection, input query and number of retrieved results as argument and returns the retrieved documents." ] }, { @@ -1792,59 +1854,19 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Dict\n", - "def get_dynamic_prompt_params(prompt_params: Dict, \n", - " part_idx: int, \n", - " total_parts: int,\n", - " chat_context: str,\n", - " chunk: str) -> str:\n", - " \"\"\"\n", - " Construct prompt template dynamically per chunk while maintaining the chat context of the response generation.\n", - " \n", - " Args:\n", - " prompt_params (Dict): Original prompt parameters\n", - " part_idx (int): Index of current conversation part\n", - " total_parts (int): Total number of conversation parts\n", - " chat_context (str): Chat context from previous parts\n", - " chunk (str): Current chunk of text to be processed\n", - " Returns:\n", - " str: Dynamically constructed prompt template with part-specific params\n", - " \"\"\"\n", - " dynamic_prompt_params = prompt_params.copy()\n", - " # saves the chat context from previous parts\n", - " dynamic_prompt_params[\"context\"] = chat_context\n", - " # saves the current chunk of text to be processed as input\n", - " dynamic_prompt_params[\"input\"] = chunk\n", - " \n", - " # Add part-specific instructions\n", - " if part_idx == 0: # Introduction part\n", - " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", - " You are generating the Introduction part of a long report.\n", - " Don't cover any topics yet, just define the scope of the report.\n", - " \"\"\"\n", - " elif part_idx == total_parts - 1: # Conclusion part\n", - " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", - " You are generating the last part of a long report. \n", - " For this part, first discuss the below INPUT. Second, write a \"Conclusion\" section summarizing the main points discussed given in CONTEXT.\n", - " \"\"\"\n", - " else: # Main analysis part\n", - " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", - " You are generating part {part_idx+1} of {total_parts} parts of a long report.\n", - " For this part, analyze the below INPUT.\n", - " Organize your response in a way that is easy to read and understand either by creating new or merging with previously created structured sections given in CONTEXT.\n", - " \"\"\"\n", - " \n", - " return dynamic_prompt_params" + "def query_collection(collection, query_text, n_results=3):\n", + " results = collection.query(\n", + " query_texts=[query_text],\n", + " n_results=n_results\n", + " )\n", + " return results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "**Step 4: Generating the Report**\n", - "\n", - "Finally, we will write a function that generates the actual report by calling the `LLMChain` with the dynamically updated prompt parameters for each chunk and concatenating the results at the end." + "We write a simple query, enquiring the purpose of the book." ] }, { @@ -1853,24 +1875,907 @@ "metadata": {}, "outputs": [], "source": [ - "def generate_report(input_content: str, llm_model_name: str, \n", - " role: str, requirements: str,\n", - " chunk_size: int, chunk_overlap: int) -> str:\n", - " # stores the parts of the report, each generated by an individual LLM call\n", - " report_parts = [] \n", - " # split the input content into chunks\n", - " chunks = get_chunks(input_content, chunk_size, chunk_overlap)\n", - " # initialize the chat context with the input content\n", - " chat_context = input_content\n", - " # number of parts to be generated\n", - " num_parts = len(chunks)\n", - "\n", - " prompt_params = {\n", - " \"role\": role, # user-provided\n", - " \"context\": \"\", # dinamically updated per part\n", - " \"instruction\": \"\", # dynamically updated per part\n", - " \"input\": \"\", # dynamically updated per part\n", - " \"requirements\": requirements #user-priovided\n", + "q = \"What is the purpose of this book?\"\n", + "res = query_collection(collection, q)\n", + "res.get(\"ids\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print([['intro', 'input', 'structured_output']])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As response, we obtain an object that contains several attributes including:\n", + "- `documents`: The actual documents retrieved from the collection, i.e. the chapters \n", + "- `ids`: The ids of the documents retrieved from the collection\n", + "- `distances`: The distances of the documents to the query vector\n", + "\n", + "We can see that the chapters \"Introduction\", \"Input\" and \"Structured Output\" are retrieved from the collection ordered by their distance to the query vector.\n", + "\n", + "We observe that the Introduction chapter is the most relevant one as it ranks first, followed by the Input and Structured Output chapters. Indeed, the purpose of the book is included in the Introduction chapter demonstrating the retrieval system successfully retrieved the most relevant document to the input query, in this simple example.\n", + "\n", + "In order to understand how the retrieval system works and how the \"distance to the query vector\" is computed, we need to understand how the embeddings are created and how the documents are indexed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Embeddings**\n", + "\n", + "Embeddings are numerical representations of data (including text, images, audio, etc.) that capture meaning, allowing machines to process data quantitatively. Each embedding can be represented as a vector of floating-point numbers such that embedded data with similar meanings produce similar, i.e. close, vectors [^embeddings_definition].\n", + "\n", + "[^embeddings_definition]: Bengio et al. {cite}`bengio2014representationlearningreviewnew` provide serves as an excellent reference for representation learning in general including embeddings. OpenAI provides a good intro to Embeddings for developers {cite}`openai2024embeddings`\n", + "\n", + "For text data, small distances among embeddings suggest high semantic relatedness and large distances suggest low semantic relatedness among the embedded texts. HuggingFace provides a leaderboard of embeddings models {cite}`huggingface2024mteb`, which are ranked by in dimensions such as classification, clustering and reranking performance.\n", + "\n", + "Behind the scenes, ChromaDB is using the model `all-MiniLM-L6-v2` by default [^chroma_embeddings] to create embeddings for the input documents and the query (see {numref}`embedding`). This model is available in `sentence_transformers` {cite}`sentencetransformers2024website`. Let's see how it works.\n", + "\n", + "```{figure} ../_static/input/embedding.svg\n", + "---\n", + "name: embedding\n", + "alt: Embedding\n", + "scale: 70%\n", + "align: center\n", + "---\n", + "Embedding\n", + "```\n", + "\n", + "[^chroma_embeddings]: ChromaDB enables custom embedding functions and provides a list of wrappers around commonly used embedding models and APIs https://docs.trychroma.com/docs/embeddings/embedding-functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sentence_transformers import SentenceTransformer\n", + "\n", + "embedding_model = SentenceTransformer('all-MiniLM-L6-v2')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We replicate what ChromaDB did by embedding our chapters as well as input query using sentence transformers." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(4, 384)\n" + ] + } + ], + "source": [ + "q = \"What is the purpose of this book?\"\n", + "docs_to_embed = [q] + chapters\n", + "embeddings = embedding_model.encode(docs_to_embed)\n", + "print(embeddings.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a result, we obtain four 384-dimensional vectors representing our embeddings (one for each of the three chapters and one for the input query).\n", + "\n", + "Now we can calculate similarity among the embeddings. By default, sentence transformers uses cosine similarity to calculate the similarity between embeddings. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "similarities = embedding_model.similarity(embeddings, embeddings)\n", + "similarities" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```\n", + "tensor([[1.0000, 0.4402, 0.3022, 0.4028],\n", + " [0.4402, 1.0000, 0.6606, 0.5807],\n", + " [0.3022, 0.6606, 1.0000, 0.6313],\n", + " [0.4028, 0.5807, 0.6313, 1.0000]])\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's visualize the similarity matrix to better understand the relationships between our documents in {numref}`similarities`. The top row of the matrix represents the similarity of the input query against all chapters. That's exactly what we previously obtained by querying ChromaDB which returned a response with documents ranked by similarity to input query.\n", + "\n", + "```{figure} ../_static/input/similarity.png\n", + "---\n", + "name: similarities\n", + "alt: Similarity matrix heatmap\n", + "scale: 90%\n", + "align: center\n", + "---\n", + "Similarity matrix heatmap showing relationships among query and chapters.\n", + "``` \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Calculating similarity among embeddings can become computationally intensive if brute force is used, i.e. pair-wise computation, as the number of documents grows in the knowledge base. Indexing is a technique to help address this challenge." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Indexing**\n", + "\n", + "Indexing is a crucial optimization technique that makes similarity searches faster and more efficient.\n", + "\n", + "Without indexing, finding similar vectors would require an exhaustive search - comparing a query vector against every single vector in the database. For large datasets, this becomes prohibitively slow.\n", + "\n", + "Common indexing strategies include:\n", + "\n", + "1. **Tree-based Indexes**\n", + " - Examples include KD-trees and Ball trees\n", + " - Work by partitioning the vector space into hierarchical regions\n", + " - Effective for low-dimensional data but suffer from the \"curse of dimensionality\"\n", + "\n", + "2. **Graph-based Indexes**\n", + " - HNSW (Hierarchical Navigable Small World) is a prominent example\n", + " - Creates a multi-layered graph structure for navigation\n", + " - Offers excellent search speed but requires more memory\n", + "\n", + "3. **LSH (Locality-Sensitive Hashing)**\n", + " - Uses hash functions that map similar vectors to the same buckets\n", + " - More memory-efficient than graph-based methods\n", + " - May sacrifice some accuracy for performance\n", + "\n", + "4. **Quantization-based Indexes**\n", + " - Product Quantization compresses vectors by encoding them into discrete values\n", + " - Reduces memory footprint significantly\n", + " - Good balance between accuracy and resource usage\n", + "\n", + "HNSW is the underlying library for Chroma vector indexing and search {cite}`chromadb2024hnsw`. HNSW provides fast searches with high accuracy but uses more memory. LSH and quantization methods offer better memory efficiency but may sacrifice some precision.\n", + "\n", + "But are indexing + basic embeddings based similarity sufficient? Often not, as we will see next as we cover reranking technique." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reranking\n", + "\n", + "Let's go back to querying our vector database. Here are additional examples." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we write a query about how to get structured output from LLMs. Successfully retrieving the \"Structured Output\" chapter from the book as top result." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[['structured_output', 'input', 'intro']]\n" + ] + } + ], + "source": [ + "q = \"How to get structured output from LLMs?\"\n", + "res = query_collection(collection, q)\n", + "res.get(\"ids\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we would like to obtain a tutorial on `Docling`, a tool we covered in this very chapter. However, we fail to obtain the correct chapter and instead obtain the \"Introduction\" chapter as a result." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[['intro', 'input', 'structured_output']]\n" + ] + } + ], + "source": [ + "q = \"Docling tutorial\"\n", + "res = query_collection(collection, q)\n", + "res.get(\"ids\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Retrieval systems solely based on vector similarity search might miss semantic relevance. That brings the need for techniques that can improve accuracy of the retrieval system. One such technique is re-ranking.\n", + "\n", + "Re-ranking is a method that can improve accuracy of the retrieval system by re-ranking the retrieved documents based on their relevance to the input query.\n", + "\n", + "In the following, we will use the `sentence_transformers` library to re-rank the retrieved documents based on their relevance to the input query. We utilize the `CrossEncoder` model to re-rank the documents. Cross-Encoder models are more accurate at judging relevance at the cost of speed compared to basic vector-based similarity. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can implement a reranking step in a RAG system using a Cross-Encoder model in the following steps:\n", + "\n", + "1. First, we initialize the Cross-Encoder model:\n", + "```python\n", + "model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)\n", + "```\n", + "- Uses the `ms-marco-MiniLM-L-6-v2` model, which is specifically trained for passage reranking\n", + "- Sets a maximum sequence length of 512 tokens\n", + "- This model is designed to score the relevance between query-document pairs\n", + "\n", + "2. Then we perform the reranking:\n", + "```python\n", + "scores = model.predict([(q, doc) for doc in res[\"documents\"][0]])\n", + "```\n", + "- Creates pairs of (query, document) for each retrieved document\n", + "- The model predicts relevance scores for each pair\n", + "- Higher scores indicate better semantic match between query and document\n", + "\n", + "3. Finally, we select the best match:\n", + "```python\n", + "print(res[\"documents\"][0][np.argmax(scores)])\n", + "```\n", + "- `np.argmax(scores)` finds the index of the highest scoring document\n", + "- Uses that index to retrieve the most relevant document\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We obtain the following scores for the retrieved documents (\"intro\", \"input\", \"structured_output\"), the higher the score, the more relevant the document is in relation to the input query.\n", + "\n", + "```\n", + "array([-8.52623 , -6.328738, -8.750055], dtype=float32)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a result, we obtain the index of the highest scoring document, which corresponds to the \"input\" chapter. Hence, the re-ranking step successfully retrieved the correct chapter." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "input\n" + ] + } + ], + "source": [ + "print(res[\"ids\"][0][np.argmax(scores)])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The ideia is to first run semantic similarity on embeddings, which should be fast but potentially inaccurate, and then run re-raking on the top-k results, which is more accurate but slower. By doing so, we can balance the speed and accuracy of the retrieval system.\n", + "\n", + "Hence, instead of going over all retrieved documents:\n", + "```python\n", + "scores = model.predict([(q, doc) for doc in res[\"documents\"][0]])\n", + "```\n", + "We would run reranking on the TOPK results, where TOPK <<< number of documents:\n", + "```python\n", + "scores = model.predict([(q, doc) for doc in res[\"documents\"][0][:TOPK]])\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### LLMs with RAG\n", + "\n", + "We are finally ready to use the retrieval system to help the LLM answer our authorship question. A common way to integrate RAGs with LLMs is via in-context learning. With in-context learning the LLM learns from the retrieved documents by providing them in the context window as represented in {numref}`incontext`. This is accomplished via a prompt template structure as follows.\n", + "\n", + "```{figure} ../_static/input/incontext.svg\n", + "---\n", + "name: incontext\n", + "alt: In-Context Learning\n", + "scale: 95%\n", + "align: center\n", + "---\n", + "RAG LLM with In-Context Learning\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + " rag_system_prompt_template = f\"\"\"\n", + " You are a helpful assistant that answers questions based on the provided CONTEXT.\n", + "\n", + " CONTEXT: {context}\n", + " \"\"\"\n", + "\n", + " user_prompt_template = f\"\"\"\n", + " QUESTION: {input}\n", + " \"\"\"\n", + "```\n", + "\n", + "This prompt strategy demonstrates a common in-context learning pattern where retrieved documents are incorporated into the LLM's context to enhance response accuracy and relevance. The prompt structure typically consists of a system prompt that:\n", + "- Sets clear boundaries for the LLM to use information from the provided context\n", + "- Includes the retrieved documents as context\n", + "\n", + "This approach:\n", + "- Reduces hallucination by grounding responses in source documents\n", + "- Improves answer relevance by providing contextually relevant information to the LLM\n", + "\n", + "The context variable is typically populated with the highest-scoring document(s) from the retrieval step, while the input variable contains the user's original query." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def RAG_qa(client, model, context, input):\n", + " \"\"\"\n", + " Generate a summary of input using a given model\n", + " \"\"\"\n", + " rag_system_prompt_template = f\"\"\"You are a helpful assistant that answers questions based on the provided CONTEXT.\n", + "\n", + " CONTEXT: {context}\n", + " \"\"\"\n", + " \n", + " response = client.chat.completions.create(\n", + " model=model,\n", + " messages=[{\"role\": \"system\", \"content\": rag_system_prompt_template},\n", + " {\"role\": \"user\", \"content\": f\"QUESTION: {input}\"}]\n", + " )\n", + " return response.choices[0].message.content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we set the LLM." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dotenv import load_dotenv\n", + "import os\n", + "\n", + "# Load environment variables from .env file\n", + "load_dotenv()\n", + "\n", + "from openai import OpenAI\n", + "client = OpenAI()\n", + "model = \"gpt-4o-mini\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, we run the retrieve step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "res = query_collection(collection, q)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we run the re-ranking step setting it to consider the `TOPK` retrieved documents." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TOPK = 2\n", + "scores = model.predict([(q, doc) for doc in res[\"documents\"][0][:TOPK]])\n", + "res_reranked = res[\"documents\"][0][np.argmax(scores)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then pass the top document as context and invoke the LLM with our RAG-based template leading to a successful response." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The author of the book \"Taming LLMs\" is Tharsis Souza.\n" + ] + } + ], + "source": [ + "answer = RAG_qa(model, res_reranked[0], question)\n", + "answer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this section, we motivated the use of RAGs as a tool to equip LLMs with relevant context and provided a canonical implementation of its core components. RAGs, however, can be implemented in many shapes and forms and entire books have been written about them. We point the user to additional resources if more specialized techniques and architectures are needed {cite}`kimothi2024simpleguiderag, athinaai2024ragcookbooks, diamant2024ragtechniques, hands-on-llms-book`.\n", + "\n", + "Next, we discuss RAGs challenges and limitations and conclude our RAGs section envisioning the future of RAGs challenged by the rise of long-context language models.\n", + "\n", + "### Challenges and Limitations\n", + "\n", + "While RAG systems offer powerful capabilities for enhancing LLM responses with external knowledge, they face several significant challenges and limitations that require careful consideration:\n", + " \n", + "- **Data Quality and Accuracy**: The effectiveness of RAG systems fundamentally depends on the quality and reliability of their knowledge sources. When these sources contain inaccurate, outdated, biased, or incomplete information, the system's responses become unreliable. This challenge is particularly acute when dealing with rapidly evolving topics or when sourcing information from unverified channels.\n", + " \n", + "- **Computational Cost and Latency**: Implementing RAG systems at scale presents computational and operational challenges. The process of embedding documents, maintaining vector databases, and performing similarity searches across large knowledge bases demands computational, budget and operational resources. In real-time applications, these requirements can introduce noticeable latency, potentially degrading the user experience and limiting practical applications.\n", + " \n", + "- **Explainability and Evaluation**: The complexity of RAG systems, arising from the intricate interaction between retrieval mechanisms and generative models, makes it difficult to trace and explain their reasoning processes. Traditional evaluation metrics often fail to capture the nuanced aspects of RAG performance, such as contextual relevance and factual consistency. This limitation hampers both system improvement and stakeholder trust. Readers are encouraged to read Chapter {ref}`evals` for general LLM evaluation issues as well as consider tools such as Ragas {cite}`ragas2024evaluation` for RAG evaluation.\n", + " \n", + "- **Hallucination Management**: Though RAG systems help ground LLM responses in source documents, they do not completely eliminate hallucinations. The generative component may still produce content that extrapolates beyond or misinterprets the retrieved context. This risk becomes particularly concerning when the system confidently presents incorrect information with apparent source attribution.\n", + "\n", + "\n", + "Moreover, recent research has shed light on critical limitations of key techniques used in RAGs systems. A relevant finding pertains to reranking, which has shown {cite}`jacob2024drowningdocumentsconsequencesscaling`:\n", + "\n", + "- **Diminishing Returns**: Performance degrades as the number of documents (K) increases, sometimes performing worse than basic retrievers when dealing with large datasets.\n", + "- **Poor Document Discrimination**: Rerankers can be misled by irrelevant documents, sometimes assigning high scores to content with minimal relevance to the query.\n", + "- **Consistency Issues**: Performance and relative rankings between different rerankers can vary significantly depending on the number of documents being processed.\n", + "\n", + "### Will RAGs exist in the future?\n", + "\n", + "This question is posed as we contrast RAGs with LLMs with long-context windows (LC).\n", + "\n", + "Recent research has shed light on this specific point {cite}`li2024retrievalaugmentedgenerationlongcontext`, suggesting that, on the one hand, RAGs can be seen as a cost-effective alternative to LC models:\n", + "* RAGs offer lower computational cost compared to LC due to the significantly shorter input length required for processing.\n", + "* This cost-efficiency arises because RAG reduces the number of input tokens to LLMs, which of course reduces usage cost as pricing is based on the number of input (and output) tokens.\n", + "\n", + "On the other hand, this RAG benefit is achieved at the cost of performance:\n", + "* Recent advancements in LLMs, in particular with Gemini-1.5 and GPT-4o models, demonstrate capabilities in understanding long contexts directly, which enables them to outperform RAG in terms of average performance\n", + "* LC models can process extremely long contexts, such as Gemini 1.5 which can handle up to 1 million tokens, and these models benefit from large-scale pretraining to develop strong long-context capabilities.\n", + "\n", + "This cost-performance trade-off is illustrated in {numref}`LC`, where LC models outperform RAGs in terms of average performance while RAGs are more cost-effective.\n", + "\n", + "```{figure} ../_static/input/LC.png\n", + "---\n", + "name: LC\n", + "alt: Long-Context LLMs for Superior Performance\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Long-Context LLMs demonstrate superior performance while RAGs are more cost-effective {cite}`li2024retrievalaugmentedgenerationlongcontext`.\n", + "```\n", + "\n", + "{numref}`LC` also shows a model called \"SELF-ROUTE\" which combines RAG and LC by routing queries based on model self-reflection. This is a hybrid approach that reduces computational costs while maintaining performance comparable to LC. The advantage of SELF-ROUTE is most significant for smaller values of *k*, where *k* is the number of retrieved text chunks, and SELF-ROUTE shows a marked improvement in performance over RAG, while as k increases the performance of RAG and SELF-ROUTE approaches that of LC.\n", + "\n", + "Another example of a hybrid approach that combines the benefits of both LC and RAGs is RetroLLM {cite}`li2024retrollmempoweringlargelanguage`, which is a unified framework that integrates retrieval and generation into a single process, enabling language models to generate fine-grained evidence directly from a corpus. The key contribution is that this approach delivers those benefits while eliminating the need for a separate retriever, addressing limitations of traditional RAG methods. Experimental results demonstrate RetroLLM's superior performance compared to traditional RAG methods, across both in-domain and out-of-domain tasks. It also achieves a significant reduction in token consumption due to its fine-grained evidence retrieval.\n", + "\n", + "A relevant development in this area is the introduction of LOFT {cite}`lee2024longcontextlanguagemodelssubsume`, a benchmark to assess this paradigm shift from RAGs to LCs, using real-world tasks requiring context up to millions of tokens. Evidence suggests LCs can deliver performance with simplified pipelines compared to RAGs, particularly for tasking requiring multi-hop reasoning over long contexts when using Chain-of-Thought {cite}`wei2023chainofthoughtpromptingelicitsreasoning`. However, LCs can still be outperformed by specialized retrievers, in particular Gecko, a specialized model fine-tuned on extensive text retrieval and similarity tasks.\n", + "\n", + "Bottom-line: Do we really need RAGs? The answer is conditional:\n", + "\n", + "* **RAG may be relevant when cost-effectiveness is a key requirement** and where the model needs to access vast amounts of external knowledge without incurring high computational expenses. However, as LLMs context window sizes increase and LLMs cost per input token is decreases, RAG may not be as relevant as it was before.\n", + "* **Long-context LLMs are superior when performance is the primary concern**, and the model needs to handle extensive texts that require deep contextual understanding and reasoning.\n", + "* **Hybrid approaches like SELF-ROUTE are valuable as they combine the strengths of RAG and LC** offering a practical balance between cost and performance, especially for applications where both factors are critical.\n", + "\n", + "Ultimately, the choice between RAG, LC, or a hybrid method depends on the specific requirements of the task, available resources, and the acceptable trade-off between cost and performance.\n", + "\n", + "In a later case study, we demonstrate the power of LCs as we construct a Quiz generator with citations over a large knowledge base without the use of chunking nor RAGs.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A Note on Frameworks\n", + "\n", + "We have covered a few open source tools for parsing data and provided a canonical RAG pipeline directly using an open source VectorDB together with an LLM. There is a growing number of frameworks that offer similar functionality wrapping the same core concepts at a higher level of abstraction. The two most popular ones are `Langchain` and `LlamaIndex`. \n", + "\n", + "For instance, the code below shows how to use `LlamaIndex`'s `LlamaParse` for parsing input documents, which offers support for a wide range of file formats (e.g. .pdf, .pptx, .docx, .xlsx, .html). We we can see that the code is very similar to the one we used for `MarkitDown` and `Docling`.\n", + "\n", + "```python\n", + "from llama_parse import LlamaParse\n", + "\n", + "# Initialize the parser\n", + "parser = LlamaParse(\n", + " api_key=\"llx-your-api-key-here\",\n", + " result_type=\"markdown\", # Can be \"markdown\" or \"text\"\n", + " verbose=True\n", + ")\n", + "\n", + "documents = parser.load_data([\"./doc1.pdf\", \"./doc2.pdf\"])\n", + "```\n", + "\n", + "\n", + "\n", + "As another example, the code below replicates our ChromaDB-based retrieval system using `LlamaIndex` {cite}`llamaindex2024storing`.\n", + "\n", + "As we can see, similar concepts are used in both frameworks:\n", + "- Documents to represent elements of the knowledge base\n", + "- Collections to store the documents\n", + "- Indexing of embeddings in the VectorDB and finally\n", + "- Querying the VectorDB to retrieve the documents\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "import chromadb\n", + "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n", + "from llama_index.vector_stores.chroma import ChromaVectorStore\n", + "from llama_index.core import StorageContext\n", + "\n", + "# load some documents\n", + "documents = SimpleDirectoryReader(\"./data\").load_data()\n", + "\n", + "# initialize client, setting path to save data\n", + "db = chromadb.PersistentClient(path=\"./chroma_db\")\n", + "\n", + "# create collection\n", + "chroma_collection = db.get_or_create_collection(\"tamingllms\")\n", + "\n", + "# assign chroma as the vector_store to the context\n", + "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n", + "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", + "\n", + "# create your index\n", + "index = VectorStoreIndex.from_documents(\n", + " documents, storage_context=storage_context\n", + ")\n", + "\n", + "# create a query engine and query\n", + "query_engine = index.as_query_engine()\n", + "response = query_engine.query(\"Who is the author of Taming LLMs?\")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Frameworks are useful for quickly prototyping RAG systems and for building applications on top of them as they provide a higher level of abstraction and integration with third-party libraries. However, the underlying concepts are the same as the ones we have covered in this chapter. More often than not, problems arise when developers either do not understand the underlying concepts or fail to understand the details of the implement behind the abstractions provided by the framework. Therefore, it is recommended to try and start your implementation using lower level tools as much as possible and only when (i) the underlying problem as well as (ii) the desired solution are well understood, then consider moving to higher level frameworks if really needed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Case Studies\n", + "\n", + "This section presents two case studies to complement topics we have covered in this chapter in the context of managing input data for LLMs.\n", + "\n", + "First, we cover content chunking, in particular Content Chunking with Contextual Linking which showcases how intelligent chunking strategies can overcome both context window and output token limitations. This case study illustrates techniques for breaking down and reassembling content while maintaining coherence, enabling the generation of high-quality long-form outputs despite model constraints.\n", + "\n", + "Second, we build a Quiz generator with citations using long context window. Not all knowledge intense applications require RAGs. In this case study, we show how to use long context window as well as some additional input management techniques such as prompt caching for efficiency and reference management to enhance response accuracy and verifiability. These approaches show how to maximize the benefits of larger context models while maintaining response quality." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Case Study I: Content Chunking with Contextual Linking\n", + "\n", + "Content chunking is commonly used to breakdown long-form content into smaller, manageable chunks. In the context of RAGs, this can be helpful not only to help the retrieval system find more contextually relevant documents but also lead to a more cost efficient LLM solution since fewer tokens are processed in the context window. Furthermore, semantic chunking can increase accuracy of RAG systems {cite}`zenml2024rag`.\n", + "\n", + "Content chunking with contextual linking is a chunking technique that seeks to split input content while keeping chunk-specific context, hence allowing the LLM to maintain coherence and context when generating responses per chunks. In that way, this technique tackles two key problems:\n", + "1. The LLM's inability to process long inputs to do context-size limits\n", + "2. The LLM's inability to maintain coherence and context when generating responses per chunks\n", + "\n", + "As a consequence, a third problem is also tackled: LLM's inability to generate long-form content due to the `max_output_tokens` limitation. Since we generate responses per chunk, as we will see later, we end up with a solution that is capable of generating long-form content while maintaining coherence.\n", + "\n", + "We exemplify this technique by following these steps:\n", + "1. **Chunking the Content**: The input content is split into smaller chunks. This allows the LLM to process each chunk individually, focusing on generating a complete and detailed response for that specific section of the input.\n", + "\n", + "2. **Maintaining Context**: Each chunk is linked with contextual information from the previous chunks. This helps in maintaining the flow and coherence of the content across multiple chunks.\n", + "\n", + "3. **Generating Linked Prompts**: For each chunk, a prompt is generated that includes the chunk's content and its context. This prompt is then used to generate the output for that chunk.\n", + "\n", + "4. **Combining the Outputs**: The outputs of all chunks are combined to form the final long-form content.\n", + "\n", + "Let's examine an example implementation of this technique.\n", + "\n", + "#### Generating long-form content\n", + "\n", + "- Goal: Generate a long-form report analyzing a company's financial statement.\n", + "- Input: A company's 10K SEC filing.\n", + "\n", + "```{figure} ../_static/structured_output/diagram1.png\n", + "---\n", + "name: content-chunking-with-contextual-linking\n", + "alt: Content Chunking with Contextual Linking\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Content Chunking with Contextual Linking Schematic Representation.\n", + "```\n", + "\n", + "The diagram in {numref}`content-chunking-with-contextual-linking` illustrates the process we will follow for handling long-form content generation with Large Language Models through \"Content Chunking with Contextual Linking.\" It shows how input content is first split into manageable chunks using a chunking function (e.g. `CharacterTextSplitter` with `tiktoken` tokenizer), then each chunk is processed sequentially while maintaining context from previous chunks. For each chunk, the system updates the context, generates a dynamic prompt with specific parameters, makes a call to the LLM chain, and stores the response. After all chunks are processed, the individual responses are combined with newlines to create the final report, effectively working around the token limit constraints of LLMs while maintaining coherence across the generated content.\n", + "\n", + "**Step 1: Chunking the Content**\n", + "\n", + "There are different methods for chunking, and each of them might be appropriate for different situations. However, we can broadly group chunking strategies in two types:\n", + "- **Fixed-size Chunking**: This is the most common and straightforward approach to chunking. We simply decide the number of tokens in our chunk and, optionally, whether there should be any overlap between them. In general, we will want to keep some overlap between chunks to make sure that the semantic context doesn’t get lost between chunks. Fixed-sized chunking may be a reasonable path in many common cases. Compared to other forms of chunking, fixed-sized chunking is computationally cheap and simple to use since it doesn’t require the use of any specialied techniques or libraries.\n", + "- **Content-aware Chunking**: These are a set of methods for taking advantage of the nature of the content we’re chunking and applying more sophisticated chunking to it. Examples include:\n", + " - **Sentence Splitting**: Many models are optimized for embedding sentence-level content. Naturally, we would use sentence chunking, and there are several approaches and tools available to do this, including naive splitting (e.g. splitting on periods), NLTK, and spaCy.\n", + " - **Recursive Chunking**: Recursive chunking divides the input text into smaller chunks in a hierarchical and iterative manner using a set of separators.\n", + " - **Semantic Chunking**: This is a class of methods that leverages embeddings to extract the semantic meaning present in your data, creating chunks that are made up of sentences that talk about the same theme or topic.\n", + "\n", + " Here, we will utilize `langchain` for a content-aware sentence-splitting strategy for chunking. Langchain offers several text splitters {cite}`langchain_text_splitters` such as JSON-, Markdown- and HTML-based or split by token. We will use the `CharacterTextSplitter` with `tiktoken` as our tokenizer to count the number of tokens per chunk which we can use to ensure that we do not surpass the input token limit of our model.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_chunks(text: str, chunk_size: int, chunk_overlap: int) -> list:\n", + " \"\"\"\n", + " Split input text into chunks of specified size with specified overlap.\n", + "\n", + " Args:\n", + " text (str): The input text to be chunked.\n", + " chunk_size (int): The maximum size of each chunk in tokens.\n", + " chunk_overlap (int): The number of tokens to overlap between chunks.\n", + "\n", + " Returns:\n", + " list: A list of text chunks.\n", + " \"\"\"\n", + " from langchain_text_splitters import CharacterTextSplitter\n", + "\n", + " text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", + " return text_splitter.split_text(text)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Step 2: Writing the Base Prompt Template**\n", + "\n", + "We will write a base prompt template which will serve as a foundational structure for all chunks, ensuring consistency in the instructions and context provided to the language model. The template includes the following parameters:\n", + "- `role`: Defines the role or persona the model should assume.\n", + "- `context`: Provides the background information or context for the task.\n", + "- `instruction`: Specifies the task or action the model needs to perform.\n", + "- `input_text`: Contains the actual text input that the model will process.\n", + "- `requirements`: Lists any specific requirements or constraints for the output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.prompts import PromptTemplate\n", + "def get_base_prompt_template() -> str:\n", + " \n", + " base_prompt = \"\"\"\n", + " ROLE: {role}\n", + " CONTEXT: {context}\n", + " INSTRUCTION: {instruction}\n", + " INPUT: {input}\n", + " REQUIREMENTS: {requirements}\n", + " \"\"\"\n", + " \n", + " prompt = PromptTemplate.from_template(base_prompt)\n", + " return prompt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will write a simple function that returns an `LLMChain` which is a simple `langchain` construct that allows you to chain together a combination of prompt templates, language models and output parsers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_community.chat_models import ChatLiteLLM\n", + "\n", + "def get_llm_chain(prompt_template: str, model_name: str, temperature: float = 0):\n", + " \"\"\"\n", + " Returns an LLMChain instance using langchain.\n", + "\n", + " Args:\n", + " prompt_template (str): The prompt template to use.\n", + " model_name (str): The name of the model to use.\n", + " temperature (float): The temperature setting for the model.\n", + "\n", + " Returns:\n", + " llm_chain: An instance of the LLMChain.\n", + " \"\"\"\n", + " \n", + " from dotenv import load_dotenv\n", + " import os\n", + "\n", + " # Load environment variables from .env file\n", + " load_dotenv()\n", + " \n", + " api_key_label = model_name.split(\"/\")[0].upper() + \"_API_KEY\"\n", + " llm = ChatLiteLLM(\n", + " model=model_name,\n", + " temperature=temperature,\n", + " api_key=os.environ[api_key_label],\n", + " )\n", + " llm_chain = prompt_template | llm | StrOutputParser()\n", + " return llm_chain" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Step 3: Constructing Dynamic Prompt Parameters**\n", + "\n", + "Now, we will write a function (`get_dynamic_prompt_template`) that constructs prompt parameters dynamically for each chunk." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Dict\n", + "def get_dynamic_prompt_params(prompt_params: Dict, \n", + " part_idx: int, \n", + " total_parts: int,\n", + " chat_context: str,\n", + " chunk: str) -> str:\n", + " \"\"\"\n", + " Construct prompt template dynamically per chunk while maintaining the chat context of the response generation.\n", + " \n", + " Args:\n", + " prompt_params (Dict): Original prompt parameters\n", + " part_idx (int): Index of current conversation part\n", + " total_parts (int): Total number of conversation parts\n", + " chat_context (str): Chat context from previous parts\n", + " chunk (str): Current chunk of text to be processed\n", + " Returns:\n", + " str: Dynamically constructed prompt template with part-specific params\n", + " \"\"\"\n", + " dynamic_prompt_params = prompt_params.copy()\n", + " # saves the chat context from previous parts\n", + " dynamic_prompt_params[\"context\"] = chat_context\n", + " # saves the current chunk of text to be processed as input\n", + " dynamic_prompt_params[\"input\"] = chunk\n", + " \n", + " # Add part-specific instructions\n", + " if part_idx == 0: # Introduction part\n", + " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", + " You are generating the Introduction part of a long report.\n", + " Don't cover any topics yet, just define the scope of the report.\n", + " \"\"\"\n", + " elif part_idx == total_parts - 1: # Conclusion part\n", + " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", + " You are generating the last part of a long report. \n", + " For this part, first discuss the below INPUT. Second, write a \"Conclusion\" section summarizing the main points discussed given in CONTEXT.\n", + " \"\"\"\n", + " else: # Main analysis part\n", + " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", + " You are generating part {part_idx+1} of {total_parts} parts of a long report.\n", + " For this part, analyze the below INPUT.\n", + " Organize your response in a way that is easy to read and understand either by creating new or merging with previously created structured sections given in CONTEXT.\n", + " \"\"\"\n", + " \n", + " return dynamic_prompt_params" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "**Step 4: Generating the Report**\n", + "\n", + "Finally, we will write a function that generates the actual report by calling the `LLMChain` with the dynamically updated prompt parameters for each chunk and concatenating the results at the end." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_report(input_content: str, llm_model_name: str, \n", + " role: str, requirements: str,\n", + " chunk_size: int, chunk_overlap: int) -> str:\n", + " # stores the parts of the report, each generated by an individual LLM call\n", + " report_parts = [] \n", + " # split the input content into chunks\n", + " chunks = get_chunks(input_content, chunk_size, chunk_overlap)\n", + " # initialize the chat context with the input content\n", + " chat_context = input_content\n", + " # number of parts to be generated\n", + " num_parts = len(chunks)\n", + "\n", + " prompt_params = {\n", + " \"role\": role, # user-provided\n", + " \"context\": \"\", # dinamically updated per part\n", + " \"instruction\": \"\", # dynamically updated per part\n", + " \"input\": \"\", # dynamically updated per part\n", + " \"requirements\": requirements #user-priovided\n", " }\n", "\n", " # get the LLMChain with the base prompt template\n", @@ -2076,14 +2981,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Case Study II: Github RAG\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Case Study III: Quiz Generation with Citations\n", + "### Case Study II: Quiz Generation with Citations\n", "\n", "In this case study, we will build a Quiz generator with citations that explores additional input management techniques particularly useful with long context windows. The implementation includes prompt caching for efficiency and citation tracking to enhance accuracy and verifiability. We will use Gemini 1.5 Pro as our LLM model, which has a context window of 2M tokens.\n", "\n", @@ -2400,7 +3298,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Conclusion" + "## Conclusion\n", + "\n", + "This chapter has explored critical strategies and techniques for managing input data in LLM applications, focusing on three key areas: data parsing, retrieval augmentation, and practical implementation patterns. We examined how parsing tools like MarkItDown and Docling can transform diverse data formats into LLM-compatible representations, demonstrating through case studies how parser quality can impact LLM performance. The chapter also investigated retrieval augmentation techniques, particularly RAG systems, showing how they can enhance LLM capabilities by providing access to external knowledge while discussing their future relevance in the context of emerging long-context language models.\n", + "\n", + "Through our case studies, we demonstrated practical approaches to handling common challenges in LLM applications. The Content Chunking with Contextual Linking case study illustrated techniques for managing long-form content generation while maintaining coherence across chunks. The Quiz Generation with Citations case study showcased how long-context windows can be effectively utilized without the need for complex retrieval systems, highlighting the importance of choosing the right approach based on specific application requirements rather than defaulting to more complex solutions.\n", + "\n", + "As the field continues to evolve, the choice between traditional RAG systems and emerging long-context models will likely become increasingly nuanced. While RAGs offer cost-effective solutions for incorporating external knowledge, the rise of long-context models suggests a future where simpler architectures might suffice for many applications. The key insight is that effective input data management requires careful consideration of trade-offs among complexity, cost, and performance, always guided by specific application requirements rather than following a one-size-fits-all approach. Success in building robust LLM applications will depend on understanding these trade-offs and selecting appropriate strategies for each use case." ] }, { diff --git a/tamingllms/references.bib b/tamingllms/references.bib index 75c09ae..feadf6a 100644 --- a/tamingllms/references.bib +++ b/tamingllms/references.bib @@ -1,3 +1,233 @@ +@misc{KOMAScriptDoc, + title = {{KOMA-Script Documentation Project {$\vert$} Aktive Anwender verbessern KOMA-Script.}}, + year = {2019}, + month = {Feb}, + note = {[Online; accessed 18. Feb. 2019]}, + url = {https://komascript.de} +} +@book{James2013, +abstract = {... But ESL is intended for individuals with ad- vanced training in the mathematical sciences. An Introduction to Statistical Learning (ISL) arose from the perceived need for a broader and less tech- nical treatment of these topics. ... $\backslash$n}, +archivePrefix = {arXiv}, +arxivId = {arXiv:1011.1669v3}, +author = {James, Gareth and Witten, Daniela and Hastie, Trevor and Tibshirani, Robert}, +booktitle = {An Introd. to Stat. Learn.}, +doi = {10.1007/978-1-4614-7138-7}, +eprint = {arXiv:1011.1669v3}, +isbn = {978-1-4614-7137-0}, +issn = {01621459}, +pmid = {10911016}, +title = {{An Introduction to Statistical Learning}}, +url = {http://www.google.com}, +year = {2013} +} +@article{Battle2014, +abstract = {Understanding the consequences of regulatory variation in the human genome remains a major challenge, with important implications for understanding gene regulation and interpreting the many disease-risk variants that fall outside of protein-coding regions. Here, we provide a direct window into the regulatory consequences of genetic variation by sequencing RNA from 922 genotyped individuals. We present a comprehensive description of the distribution of regulatory variation--by the specific expression phenotypes altered, the properties of affected genes, and the genomic characteristics of regulatory variants. We detect variants influencing expression of over ten thousand genes, and through the enhanced resolution offered by RNA-sequencing, for the first time we identify thousands of variants associated with specific phenotypes including splicing and allelic expression. Evaluating the effects of both long-range intra-chromosomal and trans (cross-chromosomal) regulation, we observe modularity in the regulatory network, with three-dimensional chromosomal configuration playing a particular role in regulatory modules within each chromosome. We also observe a significant depletion of regulatory variants affecting central and critical genes, along with a trend of reduced effect sizes as variant frequency increases, providing evidence that purifying selection and buffering have limited the deleterious impact of regulatory variation on the cell. Further, generalizing beyond observed variants, we have analyzed the genomic properties of variants associated with expression and splicing and developed a Bayesian model to predict regulatory consequences of genetic variants, applicable to the interpretation of individual genomes and disease studies. Together, these results represent a critical step toward characterizing the complete landscape of human regulatory variation.}, +archivePrefix = {arXiv}, +arxivId = {NIHMS150003}, +author = {Battle, Alexis and Mostafavi, Sara and Zhu, Xiaowei and Potash, James B. and Weissman, Myrna M. and McCormick, Courtney and Haudenschild, Christian D. and Beckman, Kenneth B. and Shi, Jianxin and Mei, Rui and Urban, Alexander E. and Montgomery, Stephen B. and Levinson, Douglas F. and Koller, Daphne}, +doi = {10.1101/gr.155192.113}, +eprint = {NIHMS150003}, +journal = {Genome Res.}, +number = {1}, +pages = {14--24}, +pmid = {24092820}, +title = {{Characterizing the genetic basis of transcriptome diversity through RNA-sequencing of 922 individuals}}, +volume = {24}, +year = {2014} +} + +@misc{llamaparse2024github, + title={LlamaParse: Extract structured data from text and PDFs using LLMs}, + author={LlamaIndex}, + year={2024}, + url={https://github.com/run-llama/llama_parse}, + note={LlamaParse} +} + + +@article{Zou2005, +abstract = {We propose the elastic net, a new regularization and variable selection method. Real world data and a simulation study show that the elastic net often outperforms the lasso, while enjoying a similar sparsity of representation. In addition, the elastic net encourages a grouping effect, where strongly correlated predictors tend to be in or out of the model together. The elastic net is particularly useful when the number of predictors (p) is much bigger than the number of observations (n). By contrast, the lasso is not a very satisfactory variable selection method in the p≫n case. An algorithm called LARS-EN is proposed for computing elastic net regularization paths efficiently, much like algorithm LARS does for the lasso.}, +author = {Zou, Hui and Hastie, Trevor}, +doi = {10.1111/j.1467-9868.2005.00503.x}, +journal = {J. R. Stat. Soc.}, +keywords = {grouping effect,lars algorithm,lasso,p,penalization}, +number = {2}, +pages = {301--320}, +pmid = {20713001}, +title = {{Regularization and variable selection via the elastic-net}}, +volume = {67}, +year = {2005} +} +@article{Lappalainen2013, +abstract = {Genome sequencing projects are discovering millions of genetic variants in humans, and interpretation of their functional effects is essential for understanding the genetic basis of variation in human traits. Here we report sequencing and deep analysis of messenger RNA and microRNA from lymphoblastoid cell lines of 462 individuals from the 1000 Genomes Project--the first uniformly processed high-throughput RNA-sequencing data from multiple human populations with high-quality genome sequences. We discover extremely widespread genetic variation affecting the regulation of most genes, with transcript structure and expression level variation being equally common but genetically largely independent. Our characterization of causal regulatory variation sheds light on the cellular mechanisms of regulatory and loss-of-function variation, and allows us to infer putative causal variants for dozens of disease-associated loci. Altogether, this study provides a deep understanding of the cellular mechanisms of transcriptome variation and of the landscape of functional variants in the human genome.}, +archivePrefix = {arXiv}, +arxivId = {NIHMS150003}, +author = {Lappalainen, Tuuli and Sammeth, Michael and Friedl{\"{a}}nder, Marc R. and {'T Hoen}, Peter A.C. and Monlong, Jean and Rivas, Manuel A. and Gonz{\`{a}}lez-Porta, Mar and Kurbatova, Natalja and Griebel, Thasso and Ferreira, Pedro G. and Barann, Matthias and Wieland, Thomas and Greger, Liliana and {Van Iterson}, Maarten and Alml{\"{o}}f, Jonas and Ribeca, Paolo and Pulyakhina, Irina and Esser, Daniela and Giger, Thomas and Tikhonov, Andrew and Sultan, Marc and Bertier, Gabrielle and Macarthur, Daniel G. and Lek, Monkol and Lizano, Esther and Buermans, Henk P.J. and Padioleau, Ismael and Schwarzmayr, Thomas and Karlberg, Olof and Ongen, Halit and Kilpinen, Helena and Beltran, Sergi and Gut, Marta and Kahlem, Katja and Amstislavskiy, Vyacheslav and Stegle, Oliver and Pirinen, Matti and Montgomery, Stephen B. and Donnelly, Peter and McCarthy, Mark I. and Flicek, Paul and Strom, Tim M. and Lehrach, Hans and Schreiber, Stefan and Sudbrak, Ralf and Carracedo, {\'{A}}ngel and Antonarakis, Stylianos E. and H{\"{a}}sler, Robert and Syv{\"{a}}nen, Ann Christine and {Van Ommen}, Gert Jan and Brazma, Alvis and Meitinger, Thomas and Rosenstiel, Philip and Guig{\'{o}}, Roderic and Gut, Ivo G. and Estivill, Xavier and Dermitzakis, Emmanouil T.}, +doi = {10.1038/nature12531}, +eprint = {NIHMS150003}, +isbn = {1476-4687 (Electronic)$\backslash$r0028-0836 (Linking)}, +issn = {00280836}, +journal = {Nature}, +number = {7468}, +pages = {506--511}, +pmid = {24037378}, +title = {{Transcriptome and genome sequencing uncovers functional variation in humans}}, +volume = {501}, +year = {2013} +} +@article{ENCODEProjectConsortium2012, +abstract = {The human genome encodes the blueprint of life, but the function of the vast majority of its nearly three billion bases is unknown. The Encyclopedia of DNA Elements (ENCODE) project has systematically mapped regions of transcription, transcription factor association, chromatin structure and histone modification. These data enabled us to assign biochemical functions for 80{\%} of the genome, in particular outside of the well-studied protein-coding regions. Many discovered candidate regulatory elements are physically associated with one another and with expressed genes, providing new insights into the mechanisms of gene regulation. The newly identified elements also show a statistical correspondence to sequence variants linked to human disease, and can thereby guide interpretation of this variation. Overall, the project provides new insights into the organization and regulation of our genes and genome, and is an expansive resource of functional annotations for biomedical research.}, +author = {{ENCODE Project Consortium}, An Integrated Encyclopedia of DNA Elements in the Human}, +doi = {10.1038/nature11247}, +issn = {1476-4687}, +journal = {Nature}, +number = {7414}, +pages = {57--74}, +pmid = {22955616}, +title = {{An integrated encyclopedia of DNA elements in the human genome.}}, +url = {http://www.ncbi.nlm.nih.gov/pubmed/22955616{\%}5Cnhttp://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC3439153}, +volume = {489}, +year = {2012} +} +@article{Gusev2018, +abstract = {Genome-wide association studies (GWAS) have identified over 100 risk loci for schizophrenia, but the causal mechanisms remain largely unknown. We performed a transcriptome-wide association study (TWAS) integrating expression data from brain, blood, and adipose tissues across 3,693 individuals with schizophrenia GWAS of 79,845 individuals from the Psychiatric Genomics Consortium. We identified 157 genes with a transcriptome-wide significant association, of which 35 did not overlap a known GWAS locus; the largest number involved alternative splicing in brain. 42/157 genes were also associated to specific chromatin phenotypes measured in 121 independent samples (a 4-fold enrichment over background genes). This high-throughput connection of GWAS findings to specific genes, tissues, and regulatory mechanisms is an essential step toward understanding the biology of schizophrenia and moving towards therapeutic interventions.}, +author = {Gusev, Alexander and Mancuso, Nicholas and Won, Hyejung and Kousi, Maria and Finucane, Hilary K. and Reshef, Yakir and Song, Lingyun and Safi, Alexias and McCarroll, Steven and Neale, Benjamin M. and Ophoff, Roel A. and O'Donovan, Michael C. and Crawford, Gregory E. and Geschwind, Daniel H. and Katsanis, Nicholas and Sullivan, Patrick F. and Pasaniuc, Bogdan and Price, Alkes L.}, +doi = {10.1038/s41588-018-0092-1}, +isbn = {1546-1718}, +issn = {15461718}, +journal = {Nat. Genet.}, +keywords = {TWAS}, +mendeley-tags = {TWAS}, +number = {4}, +pages = {538--548}, +pmid = {29632383}, +title = {{Transcriptome-wide association study of schizophrenia and chromatin activity yields mechanistic disease insights}}, +volume = {50}, +year = {2018} +} +@article{Gusev2016, +abstract = {Many genetic variants influence complex traits by modulating gene expression, thus altering the abundance of one or multiple proteins. Here we introduce a powerful strategy that integrates gene expression measurements with summary association statistics from large-scale genome-wide association studies (GWAS) to identify genes whose cis-regulated expression is associated with complex traits. We leverage expression imputation from genetic data to perform a transcriptome-wide association study (TWAS) to identify significant expression-trait associations. We applied our approaches to expression data from blood and adipose tissue measured in ∼3,000 individuals overall. We imputed gene expression into GWAS data from over 900,000 phenotype measurements to identify 69 new genes significantly associated with obesity-related traits (BMI, lipids and height). Many of these genes are associated with relevant phenotypes in the Hybrid Mouse Diversity Panel. Our results showcase the power of integrating genotype, gene expression and phenotype to gain insights into the genetic basis of complex traits.}, +archivePrefix = {arXiv}, +arxivId = {15334406}, +author = {Gusev, Alexander and Ko, Arthur and Shi, Huwenbo and Bhatia, Gaurav and Chung, Wonil and Penninx, Brenda W.J.H. and Jansen, Rick and {De Geus}, Eco J.C. and Boomsma, Dorret I. and Wright, Fred A. and Sullivan, Patrick F. and Nikkola, Elina and Alvarez, Marcus and Civelek, Mete and Lusis, Aldons J. and Lehtim{\"{a}}ki, Terho and Raitoharju, Emma and K{\"{a}}h{\"{o}}nen, Mika and Sepp{\"{a}}l{\"{a}}, Ilkka and Raitakari, Olli T. and Kuusisto, Johanna and Laakso, Markku and Price, Alkes L. and Pajukanta, P{\"{a}}ivi and Pasaniuc, Bogdan}, +doi = {10.1038/ng.3506}, +eprint = {15334406}, +isbn = {1061-4036}, +issn = {15461718}, +journal = {Nat. Genet.}, +keywords = {TWAS}, +mendeley-tags = {TWAS}, +month = {mar}, +number = {3}, +pages = {245--252}, +pmid = {26854917}, +publisher = {Nature Publishing Group}, +title = {{Integrative approaches for large-scale transcriptome-wide association studies}}, +url = {http://www.nature.com/articles/ng.3506 http://www.ncbi.nlm.nih.gov/pubmed/26854917 http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC4767558}, +volume = {48}, +year = {2016} +} +@article{Gamazon2015, +abstract = {Genome-wide association studies (GWAS) have identified thousands of variants robustly associated with complex traits. However, the biological mechanisms underlying these associations are, in general, not well understood. We propose a gene-based association method called PrediXcan that directly tests the molecular mechanisms through which genetic variation affects phenotype. The approach estimates the component of gene expression determined by an individual's genetic profile and correlates 'imputed' gene expression with the phenotype under investigation to identify genes involved in the etiology of the phenotype. Genetically regulated gene expression is estimated using whole-genome tissue-dependent prediction models trained with reference transcriptome data sets. PrediXcan enjoys the benefits of gene-based approaches such as reduced multiple-testing burden and a principled approach to the design of follow-up experiments. Our results demonstrate that PrediXcan can detect known and new genes associated with disease traits and provide insights into the mechanism of these associations.}, +author = {Gamazon, Eric R. and Wheeler, Heather E. and Shah, Kaanan P. and Mozaffari, Sahar V. and Aquino-Michaels, Keston and Carroll, Robert J. and Eyler, Anne E. and Denny, Joshua C. and Nicolae, Dan L. and Cox, Nancy J. and Im, Hae Kyung}, +doi = {10.1038/ng.3367}, +isbn = {1061-4036}, +issn = {15461718}, +journal = {Nat. Genet.}, +keywords = {TWAS}, +mendeley-tags = {TWAS}, +month = {sep}, +number = {9}, +pages = {1091--1098}, +pmid = {26258848}, +title = {{A gene-based association method for mapping traits using reference transcriptome data}}, +url = {http://www.ncbi.nlm.nih.gov/pubmed/26258848 http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=PMC4552594 http://www.nature.com/articles/ng.3367}, +volume = {47}, +year = {2015} +} +@misc{Lonsdale2013, +abstract = {Genome-wide association studies have identified thousands of loci for common diseases, but, for the majority of these, the mechanisms underlying disease susceptibility remain unknown. Most associated variants are not correlated with protein-coding changes, suggesting that polymorphisms in regulatory regions probably contribute to many disease phenotypes. Here we describe the Genotype-Tissue Expression (GTEx) project, which will establish a resource database and associated tissue bank for the scientific community to study the relationship between genetic variation and gene expression in human tissues.}, +archivePrefix = {arXiv}, +arxivId = {NIHMS150003}, +author = {Lonsdale, John and Thomas, Jeffrey and Salvatore, Mike and Phillips, Rebecca and Lo, Edmund and Shad, Saboor and Hasz, Richard and Walters, Gary and Garcia, Fernando and Young, Nancy and Foster, Barbara and Moser, Mike and Karasik, Ellen and Gillard, Bryan and Ramsey, Kimberley and Sullivan, Susan and Bridge, Jason and Magazine, Harold and Syron, John and Fleming, Johnelle and Siminoff, Laura and Traino, Heather and Mosavel, Maghboeba and Barker, Laura and Jewell, Scott and Rohrer, Dan and Maxim, Dan and Filkins, Dana and Harbach, Philip and Cortadillo, Eddie and Berghuis, Bree and Turner, Lisa and Hudson, Eric and Feenstra, Kristin and Sobin, Leslie and Robb, James and Branton, Phillip and Korzeniewski, Greg and Shive, Charles and Tabor, David and Qi, Liqun and Groch, Kevin and Nampally, Sreenath and Buia, Steve and Zimmerman, Angela and Smith, Anna and Burges, Robin and Robinson, Karna and Valentino, Kim and Bradbury, Deborah and Cosentino, Mark and Diaz-Mayoral, Norma and Kennedy, Mary and Engel, Theresa and Williams, Penelope and Erickson, Kenyon and Ardlie, Kristin and Winckler, Wendy and Getz, Gad and DeLuca, David and {Daniel MacArthur} and Kellis, Manolis and Thomson, Alexander and Young, Taylor and Gelfand, Ellen and Donovan, Molly and Meng, Yan and Grant, George and Mash, Deborah and Marcus, Yvonne and Basile, Margaret and Liu, Jun and Zhu, Jun and Tu, Zhidong and Cox, Nancy J. and Nicolae, Dan L. and Gamazon, Eric R. and Im, Hae Kyung and Konkashbaev, Anuar and Pritchard, Jonathan and Stevens, Matthew and Flutre, Timoth{\`{e}}e and Wen, Xiaoquan and Dermitzakis, Emmanouil T. and Lappalainen, Tuuli and Guigo, Roderic and Monlong, Jean and Sammeth, Michael and Koller, Daphne and Battle, Alexis and Mostafavi, Sara and McCarthy, Mark and Rivas, Manual and Maller, Julian and Rusyn, Ivan and Nobel, Andrew and Wright, Fred and Shabalin, Andrey and Feolo, Mike and Sharopova, Nataliya and Sturcke, Anne and Paschal, Justin and Anderson, James M. and Wilder, Elizabeth L. and Derr, Leslie K. and Green, Eric D. and Struewing, Jeffery P. and Temple, Gary and Volpi, Simona and Boyer, Joy T. and Thomson, Elizabeth J. and Guyer, Mark S. and Ng, Cathy and Abdallah, Assya and Colantuoni, Deborah and Insel, Thomas R. and Koester, Susan E. and {A Roger Little} and Bender, Patrick K. and Lehner, Thomas and Yao, Yin and Compton, Carolyn C. and Vaught, Jimmie B. and Sawyer, Sherilyn and Lockhart, Nicole C. and Demchok, Joanne and Moore, Helen F.}, +booktitle = {Nat. Genet.}, +doi = {10.1038/ng.2653}, +eprint = {NIHMS150003}, +isbn = {1061-4036}, +issn = {10614036}, +number = {6}, +pages = {580--585}, +pmid = {23715323}, +title = {{The Genotype-Tissue Expression (GTEx) project}}, +volume = {45}, +year = {2013} +} +@article{Ramasamy2014, +abstract = {Germ-line genetic control of gene expression occurs via expression quantitative trait loci (eQTLs). We present a large, exon-specific eQTL data set covering ten human brain regions. We found that cis-eQTL signals (within 1 Mb of their target gene) were numerous, and many acted heterogeneously among regions and exons. Co-regulation analysis of shared eQTL signals produced well-defined modules of region-specific co-regulated genes, in contrast to standard coexpression analysis of the same samples. We report cis-eQTL signals for 23.1{\%} of catalogued genome-wide association study hits for adult-onset neurological disorders. The data set is publicly available via public data repositories and via http://www.braineac.org/. Our study increases our understanding of the regulation of gene expression in the human brain and will be of value to others pursuing functional follow-up of disease-associated variants.}, +author = {Ramasamy, A and Trabzuni, D and Guelfi, S and Varghese, V and Smith, C and Walker, R and De, T and Consortium, U K Brain Expression and {North American Brain Expression}, Consortium and Coin, L and de Silva, R and Cookson, M R and Singleton, A B and Hardy, J and Ryten, M and Weale, M E}, +doi = {10.1038/nn.3801}, +isbn = {1546-1726 (Electronic)$\backslash$r1097-6256 (Linking)}, +issn = {1546-1726; 1097-6256}, +journal = {Nat. Neurosci.}, +keywords = {*Gene Expression Regulation,*Genetic Predisposition to Disease,*Quantitative Trait Loci,Adolescent,Adult,Aged,Aged, 80 and over,Brain/*anatomy {\&} histology/metabolism,Female,Gene Expression Profiling,Genetic Association Studies,Humans,Male,Middle Aged,Nervous System Diseases/*genetics/*pathology,Oligonucleotide Array Sequence Analysis,Polymorphism, Single Nucleotide,Young Adult}, +number = {10}, +pages = {1418--1428}, +pmid = {25174004}, +title = {{Genetic variability in the regulation of gene expression in ten regions of the human brain}}, +url = {http://www.ncbi.nlm.nih.gov/pubmed/25174004}, +volume = {17}, +year = {2014} +} +@article{Visscher2008, +abstract = {Heritability allows a comparison of the relative importance of genes and environment to the variation of traits within and across populations. The concept of heritability and its definition as an estimable, dimensionless population parameter was introduced by Sewall Wright and Ronald Fisher nearly a century ago. Despite continuous misunderstandings and controversies over its use and application, heritability remains key to the response to selection in evolutionary biology and agriculture, and to the prediction of disease risk in medicine. Recent reports of substantial heritability for gene expression and new estimation methods using marker data highlight the relevance of heritability in the genomics era.}, +archivePrefix = {arXiv}, +arxivId = {arXiv:1011.1669v3}, +author = {Visscher, Peter M and Hill, William G and Wray, Naomi R}, +doi = {10.1038/nrg2322}, +eprint = {arXiv:1011.1669v3}, +issn = {1471-0064}, +journal = {Nat. Rev. Genet.}, +number = {4}, +pages = {255--266}, +pmid = {18319743}, +title = {{Heritability in the genomics era--concepts and misconceptions.}}, +volume = {9}, +year = {2008} +} +@article{Giambartolomei2014, +abstract = {Genetic association studies, in particular the genome-wide association study design, have provided a wealth of novel insights into the aetiology of a wide range of human diseases and traits. The next challenge consists of understanding the molecular basis of these associations. The integration of multiple association datasets, including gene expression datasets, can contribute to this goal. We have developed a novel statistical methodology to assess whether two association signals are consistent with a shared causal variant. An application is the integration of disease scans with expression quantitative trait locus (eQTL) studies, but any pair of GWAS datasets can be integrated in this framework. We demonstrate the value of the approach by reanalysing a gene expression dataset in 966 liver samples with a published meta-analysis of lipid traits including {\textgreater}100, 000 individuals of European ancestry. Combining all lipid biomarkers, our reanalysis supported 29 out of 38 reported colocalisation results with eQTLs and identified 14 new colocalisation results, highlighting the value of a formal statistical test. In two cases of reported eQTL-lipid pairs (IFT172, TBKBP1) for which our analysis suggests that the eQTL pattern is not consistent with the lipid association, we identify alternative colocalisation results with GCKR and KPNB1, indicating that these genes are more likely to be causal in these genomic intervals. A key feature of the method is the ability to derive the output statistics from single SNP summary statistics, hence making it possible to perform systematic meta-analysis type comparisons across multiple GWAS datasets (http://coloc.cs.ucl.ac.uk/coloc/). Our methodology provides information about candidate causal genes in associated intervals and has direct implications for the understanding of complex diseases and the design of drugs to target disease pathways.}, +archivePrefix = {arXiv}, +arxivId = {1305.4022}, +author = {Giambartolomei, Claudia and Vukcevic, Damjan and Schadt, Eric E. and Franke, Lude and Hingorani, Aroon D. and Wallace, Chris and Plagnol, Vincent}, +doi = {10.1371/journal.pgen.1004383}, +eprint = {1305.4022}, +isbn = {1553-7404 (Electronic)$\backslash$r1553-7390 (Linking)}, +issn = {15537404}, +journal = {PLoS Genet.}, +number = {5}, +pmid = {24830394}, +title = {{Bayesian Test for Colocalisation between Pairs of Genetic Association Studies Using Summary Statistics}}, +volume = {10}, +year = {2014} +} +@article{Ardlie2015, +abstract = {Understanding the functional consequences of genetic variation, and how it affects complex human disease and quantitative traits, remains a critical challenge for biomedicine. We present an analysis of RNA sequencing data from 1641 samples across 43 tissues from 175 individuals, generated as part of the pilot phase of the Genotype-Tissue Expression (GTEx) project. We describe the landscape of gene expression across tissues, catalog thousands of tissue-specific and shared regulatory expression quantitative trait loci (eQTL) variants, describe complex network relationships, and identify signals from genome-wide association studies explained by eQTLs. These findings provide a systematic understanding of the cellular and biological consequences of human genetic variation and of the heterogeneity of such effects among a diverse set of human tissues.$\backslash$nExpression, genetic variation, and tissues$\backslash$nHuman genomes show extensive genetic variation across individuals, but we have only just started documenting the effects of this variation on the regulation of gene expression. Furthermore, only a few tissues have been examined per genetic variant. In order to examine how genetic expression varies among tissues within individuals, the Genotype-Tissue Expression (GTEx) Consortium collected 1641 postmortem samples covering 54 body sites from 175 individuals. They identified quantitative genetic traits that affect gene expression and determined which of these exhibit tissue-specific expression patterns. Mel{\'{e}} et al. measured how transcription varies among tissues, and Rivas et al. looked at how truncated protein variants affect expression across tissues.$\backslash$nScience, this issue p. 648, p. 660, p. 666; see also p. 640}, +author = {Ardlie, Kristin G. and DeLuca, David S. and Segr{\`{e}}, Ayellet V. and Sullivan, Timothy J. and Young, Taylor R. and Gelfand, Ellen T. and Trowbridge, Casandra A. and Maller, Julian B. and Tukiainen, Taru and Lek, Monkol and Ward, Lucas D. and Kheradpour, Pouya and Iriarte, Benjamin and Meng, Yan and Palmer, Cameron D. and Esko, T{\~{o}}nu and Winckler, Wendy and Hirschhorn, Joel N. and Kellis, Manolis and MacArthur, Daniel G. and Getz, Gad and Shabalin, Andrey A. and Li, Gen and Zhou, Yi Hui and Nobel, Andrew B. and Rusyn, Ivan and Wright, Fred A. and Lappalainen, Tuuli and Ferreira, Pedro G. and Ongen, Halit and Rivas, Manuel A. and Battle, Alexis and Mostafavi, Sara and Monlong, Jean and Sammeth, Michael and Mel{\'{e}}, Marta and Reverter, Ferran and Goldmann, Jakob M. and Koller, Daphne and Guig{\'{o}}, Roderic and McCarthy, Mark I. and Dermitzakis, Emmanouil T. and Gamazon, Eric R. and Im, Hae Kyung and Konkashbaev, Anuar and Nicolae, Dan L. and Cox, Nancy J. and Flutre, Timoth{\'{e}}e and Wen, Xiaoquan and Stephens, Matthew and Pritchard, Jonathan K. and Tu, Zhidong and Zhang, Bin and Huang, Tao and Long, Quan and Lin, Luan and Yang, Jialiang and Zhu, Jun and Liu, Jun and Brown, Amanda and Mestichelli, Bernadette and Tidwell, Denee and Lo, Edmund and Salvatore, Michael and Shad, Saboor and Thomas, Jeffrey A. and Lonsdale, John T. and Moser, Michael T. and Gillard, Bryan M. and Karasik, Ellen and Ramsey, Kimberly and Choi, Christopher and Foster, Barbara A. and Syron, John and Fleming, Johnell and Magazine, Harold and Hasz, Rick and Walters, Gary D. and Bridge, Jason P. and Miklos, Mark and Sullivan, Susan and Barker, Laura K. and Traino, Heather M. and Mosavel, Maghboeba and Siminoff, Laura A. and Valley, Dana R. and Rohrer, Daniel C. and Jewell, Scott D. and Branton, Philip A. and Sobin, Leslie H. and Barcus, Mary and Qi, Liqun and McLean, Jeffrey and Hariharan, Pushpa and Um, Ki Sung and Wu, Shenpei and Tabor, David and Shive, Charles and Smith, Anna M. and Buia, Stephen A. and Undale, Anita H. and Robinson, Karna L. and Roche, Nancy and Valentino, Kimberly M. and Britton, Angela and Burges, Robin and Bradbury, Debra and Hambright, Kenneth W. and Seleski, John and Korzeniewski, Greg E. and Erickson, Kenyon and Marcus, Yvonne and Tejada, Jorge and Taherian, Mehran and Lu, Chunrong and Basile, Margaret and Mash, Deborah C. and Volpi, Simona and Struewing, Jeffery P. and Temple, Gary F. and Boyer, Joy and Colantuoni, Deborah and Little, Roger and Koester, Susan and Carithers, Latarsha J. and Moore, Helen M. and Guan, Ping and Compton, Carolyn and Sawyer, Sherilyn J. and Demchok, Joanne P. and Vaught, Jimmie B. and Rabiner, Chana A. and Lockhart}, +doi = {10.1126/science.1262110}, +isbn = {0036-8075}, +issn = {10959203}, +journal = {Science (80-. ).}, +number = {6235}, +pages = {648--660}, +pmid = {25954001}, +title = {{The Genotype-Tissue Expression (GTEx) pilot analysis: Multitissue gene regulation in humans}}, +volume = {348}, +year = {2015} +} + + + + --- --- @@ -23,7 +253,7 @@ @misc{liang2024controllabletextgenerationlarge @misc{hf2024quantization, title={Quantization in Optimum}, - author={Hugging Face}, + author={HuggingFace}, year={2024s}, howpublished={\url{https://huggingface.co/docs/optimum/en/concept_guides/quantization}}, note={Accessed: 2024} @@ -41,7 +271,7 @@ @misc{mistraltechnology2024 @misc{hf2024yearinreview, title={Open Source AI Year in Review 2024}, - author={Hugging Face}, + author={HuggingFace}, year={2024t}, howpublished={\url{https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024}}, note={Accessed: 2024} @@ -131,7 +361,7 @@ @misc{li2024leveraginglargelanguagemodels @misc{huggingface2024llmjudge, title={LLM as a Judge}, - author={Hugging Face}, + author={HuggingFace}, year={2024}, howpublished={\url{https://huggingface.co/learn/cookbook/en/llm_judge}}, note={Accessed: 2024} @@ -212,7 +442,7 @@ @article{qwen2 @misc{lighteval_tasks, title={Available Tasks - LightEval Wiki}, - author={Hugging Face}, + author={HuggingFace}, year={2024}, howpublished={\url{https://github.com/huggingface/lighteval/wiki/Available-Tasks}}, note={Accessed: 2024} @@ -220,7 +450,7 @@ @misc{lighteval_tasks @misc{lighteval_metrics, title={Metric List - LightEval Wiki}, - author={Hugging Face}, + author={HuggingFace}, year={2024}, howpublished={\url{https://github.com/huggingface/lighteval/wiki/Metric-List}}, note={Accessed: 2024} @@ -228,7 +458,7 @@ @misc{lighteval_metrics @misc{lighteval_server, title={Evaluate the model on a server or container - LightEval Wiki}, - author={Hugging Face}, + author={HuggingFace}, year={2024}, howpublished={\url{https://github.com/huggingface/lighteval/wiki/Evaluate-the-model-on-a-server-or-container}}, note={Accessed: 2024} @@ -237,8 +467,8 @@ @misc{lighteval_server @misc{gpt2docs, - title={GPT-2 Documentation - Hugging Face Transformers}, - author={Hugging Face}, + title={GPT-2 Documentation - HuggingFace Transformers}, + author={HuggingFace}, year={2024}, howpublished={\url{https://huggingface.co/docs/transformers/model_doc/gpt2}}, note={Accessed: 2024} @@ -258,15 +488,15 @@ @misc{qwen_openrouter_usage } @misc{hf_num_models, - title={Number of Models on Hugging Face}, - author={{Hugging Face}}, + title={Number of Models on HuggingFace}, + author={{HuggingFace}}, year={2024}, howpublished={\url{https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024?day=4}}, note={Accessed: 12/06/2024} } @misc{meta_llama_models, - title={Meta Llama Models on Hugging Face}, + title={Meta Llama Models on HuggingFace}, author={{Meta AI}}, year={2024}, howpublished={\url{https://huggingface.co/meta-llama}}, @@ -321,15 +551,6 @@ @article{long2024llms year={2024} } - -@misc{langchain_github, - title={LangChain}, - author={{LangChain}}, - year={2024z}, - howpublished={\url{https://github.com/langchain-ai/langchain}}, - note={Accessed: 12/07/2024} -} - @misc{langchain_text_splitters, title={Text Splitters - LangChain Documentation}, author={{LangChain}}, @@ -390,14 +611,6 @@ @misc{srivastava2023imitationgamequantifyingextrapolating url={https://arxiv.org/abs/2206.04615}, } -@misc{ollama2024website, - title={Ollama: Get up and running with large language models, locally}, - author={Ollama}, - year={2024}, - howpublished={Website}, - url={https://ollama.com} -} - @book{kamath2024large, title={Large Language Models: A Deep Dive: Bridging Theory and Practice}, author={Kamath, U. and Keenan, K. and Somers, G. and Sorenson, S.}, @@ -459,9 +672,9 @@ @misc{chiang2024chatbotarenaopenplatform @misc{openllmleaderboard2024, title={Open LLM Leaderboard}, - author={Hugging Face}, + author={HuggingFace}, year={2024}, - howpublished={Hugging Face Spaces}, + howpublished={HuggingFace Spaces}, url={https://huggingface.co/spaces/open-llm-leaderboard/blog}, } @@ -551,7 +764,7 @@ @misc{willard2023efficientguidedgenerationlarge @misc{smollm2024model, title={SmolLM2-360M-Instruct}, - author={Hugging Face SmolLM2-360M-Instruct}, + author={HuggingFace SmolLM2-360M-Instruct}, year={2024}, url={https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct}, note={360M parameter instruction-tuned language model, distilled for efficient deployment} @@ -565,6 +778,15 @@ @misc{promptfoo2024 note={Open source framework for testing and evaluating LLM prompts} } +@misc{scikit2024evaluation, + title={Model evaluation: quantifying the quality of predictions}, + author={{scikit-learn developers}}, + year={2024}, + howpublished={Website}, + url={https://scikit-learn.org/1.5/modules/model_evaluation.html} +} + + @misc{vidgen2024introducingv05aisafety, title={Introducing v0.5 of the AI Safety Benchmark from MLCommons}, @@ -596,9 +818,28 @@ @misc{wu2024metarewardinglanguagemodelsselfimproving url={https://arxiv.org/abs/2407.19594}, } +@misc{chollet2024tweet, + title={Chollet on ARC-AGI-1}, + author={François Chollet}, + year={2025}, + howpublished={Tweet}, + url={https://x.com/fchollet/status/1874877373629493548}, + note={Accessed: 04/01/2025} +} + +@misc{xie2024ordermattershallucinationreasoning, + title={Order Matters in Hallucination: Reasoning Order as Benchmark and Reflexive Prompting for Large-Language-Models}, + author={Zikai Xie}, + year={2024}, + eprint={2408.05093}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2408.05093}, +} + @misc{smollm2024, title={SmoLLM: A Small Language Model Distilled from a Larger Language Model for Task-specific Applications}, - author={Hugging Face SmolLM2}, + author={HuggingFace SmolLM2}, year={2024}, url={https://huggingface.co/blog/smollm}, note={Blog post describing techniques for distilling smaller, task-specific language models} @@ -681,8 +922,8 @@ @misc{mozilla2024llamafile @misc{huggingface2024llamafilemodels, - title={Llamafile Models on Hugging Face}, - author={{Hugging Face}}, + title={Llamafile Models on HuggingFace}, + author={{HuggingFace}}, year={2024x}, howpublished={Online Repository}, url={https://huggingface.co/models?library=llamafile}, @@ -692,7 +933,7 @@ @misc{huggingface2024llamafilemodels @misc{huggingface2024chattemplating, title={Chat Templating Documentation}, - author={{Hugging Face}}, + author={{HuggingFace}}, year={2024y}, howpublished={Online Documentation}, url={https://huggingface.co/docs/transformers/main/en/chat_templating}, @@ -712,7 +953,7 @@ @misc{meta2024llama2chat70b title={Llama-2-70b-chat-hf}, author={{Meta AI}}, year={2024c}, - howpublished={Hugging Face Model}, + howpublished={HuggingFace Model}, url={https://huggingface.co/meta-llama/Llama-2-70b-chat-hf}, note={70 billion parameter chat model from Meta's Llama 2 family} } @@ -732,7 +973,7 @@ @misc{unsloth2024llama3 title={Llama-3.3-70B-Instruct-GGUF}, author={{Unsloth}}, year={2024}, - howpublished={Hugging Face Model}, + howpublished={HuggingFace Model}, url={https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-GGUF}, note={GGUF quantized version of Meta's Llama 3.3 70B instruction-tuned model} } @@ -781,7 +1022,7 @@ @misc{salesforce2024wikitext title={WikiText Dataset}, author={{Salesforce}}, year={2024}, - howpublished={Hugging Face Dataset}, + howpublished={HuggingFace Dataset}, url={https://huggingface.co/datasets/Salesforce/wikitext}, note={Large-scale dataset derived from verified Good and Featured articles on Wikipedia} } @@ -814,14 +1055,40 @@ @misc{deepseek2024v3 url={https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf} } -@misc{lee2024longcontextlanguagemodelssubsume, - title={Can Long-Context Language Models Subsume Retrieval, RAG, SQL, and More?}, - author={Jinhyuk Lee and Anthony Chen and Zhuyun Dai and Dheeru Dua and Devendra Singh Sachan and Michael Boratko and Yi Luan and Sébastien M. R. Arnold and Vincent Perot and Siddharth Dalmia and Hexiang Hu and Xudong Lin and Panupong Pasupat and Aida Amini and Jeremy R. Cole and Sebastian Riedel and Iftekhar Naim and Ming-Wei Chang and Kelvin Guu}, - year={2024}, - eprint={2406.13121}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2406.13121}, +@misc{docling2024github, + title={DocLing: A Document-Level Linguistic Annotation Framework}, + author={{IBM Research}}, + year={2024}, + howpublished={GitHub Repository}, + url={https://github.com/DS4SD/docling}, + note={Framework for document-level linguistic annotation and analysis} +} + +@misc{unstructured2024github, + title={Unstructured: Open Source Libraries for Pre-Processing Documents}, + author={{Unstructured.io}}, + year={2024}, + howpublished={GitHub Repository}, + url={https://github.com/Unstructured-IO/unstructured} +} + +@misc{mendable2024firecrawl, + title={FireCrawl: A Fast and Efficient Web Crawler for LLM Training Data}, + author={{Mendable AI}}, + year={2024}, + howpublished={GitHub Repository}, + url={https://github.com/mendableai/firecrawl}, + note={High-performance web crawler optimized for collecting LLM training data} +} + + +@misc{microsoft2024markitdown, + title={MarkItDown: Structured Generation with Large Language Models}, + author={{Microsoft}}, + year={2024}, + howpublished={GitHub Repository}, + url={https://github.com/microsoft/markitdown}, + note={Framework for structured text generation using LLMs} } @@ -846,7 +1113,7 @@ @misc{a16z2024llmflation @misc{huggingface2024quantization, title={GGUF Quantization Types}, - author={{Hugging Face}}, + author={{HuggingFace}}, year={2024w}, howpublished={Online Documentation}, url={https://huggingface.co/docs/hub/gguf#quantization-types}, @@ -895,8 +1162,8 @@ @misc{ibm2024ggufversusggml } @misc{huggingface2024ggufmodels, - title={GGUF Models on Hugging Face}, - author={{Hugging Face}}, + title={GGUF Models on HuggingFace}, + author={{HuggingFace}}, year={2024x}, howpublished={Online Repository}, url={https://huggingface.co/models?search=gguf}, @@ -926,6 +1193,46 @@ @misc{singh2024globalmmluunderstandingaddressing url={https://arxiv.org/abs/2412.03304}, } +@book{kimothi2024simpleguiderag, + title={A Simple Guide to Retrieval Augmented Generation}, + author={Kimothi, Abhinav}, + year={2024}, + publisher={Manning Publications}, + isbn={9781633435858}, + note={Manning Early Access Program (MEAP)}, + url={https://www.manning.com/books/a-simple-guide-to-retrieval-augmented-generation} +} + + +@book{hands-on-llms-book, + author = {Jay Alammar and Maarten Grootendorst}, + title = {Hands-On Large Language Models}, + publisher = {O'Reilly}, + year = {2024}, + isbn = {978-1098150969}, + url = {https://www.oreilly.com/library/view/hands-on-large-language/9781098150952/}, + github = {https://github.com/HandsOnLLM/Hands-On-Large-Language-Models} +} + +@misc{diamant2024ragtechniques, + title={RAG Techniques}, + author={Nir Diamant}, + year={2024}, + howpublished={GitHub Repository}, + url={https://github.com/NirDiamant/RAG_Techniques}, + note={Collection of advanced RAG techniques and implementation patterns} +} + + +@misc{athinaai2024ragcookbooks, + title={RAG Cookbooks}, + author={{AthinaAI}}, + year={2024}, + howpublished={GitHub Repository}, + url={https://github.com/athina-ai/rag-cookbooks}, + note={Collection of recipes and best practices for building RAG applications} +} + @book{huyen2024aiengineering, @@ -989,7 +1296,7 @@ @misc{mlcommons2024lead @misc{ultrafeedback2024, title={UltraFeedback Binarized Dataset}, - author={Hugging Face H4}, + author={HuggingFace H4}, year={2024a}, url={https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized}, note={A dataset of binary preference data for training language models} @@ -998,18 +1305,18 @@ @misc{ultrafeedback2024 @misc{ultrafeedback2024z, title={UltraFeedback Binarized Dataset}, - author={Hugging Face H4}, + author={HuggingFace H4}, year={2024z}, url={https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized}, note={A dataset of binary preference data for training language models} } @misc{huggingfaceh42024, - title={Hugging Face H4}, - author={Hugging Face H4}, + title={HuggingFace H4}, + author={HuggingFace H4}, year={2024b}, url={https://huggingface.co/HuggingFaceH4}, - note={Hugging Face H4} + note={HuggingFace H4} } @misc{evalstamingllms2024, @@ -1098,7 +1405,7 @@ @misc{grattafiori2024llama3herdmodels @misc{hf2024scalingtesttime, title={Scaling Test Time Compute}, - author={Hugging Face}, + author={HuggingFace}, year={2024v}, url={https://huggingface.co/spaces/HuggingFaceH4/blogpost-scaling-test-time-compute}, note={Accessed: 2024} @@ -1106,7 +1413,7 @@ @misc{hf2024scalingtesttime @misc{hf2024ultrachat200k, title={UltraChat-200K Dataset}, - author={Hugging Face}, + author={HuggingFace}, year={2024u}, url={https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k}, note={Accessed: 2024} @@ -1166,7 +1473,7 @@ @misc{ouyang2022traininglanguagemodelsfollow @misc{zephyr2024, title={Zephyr}, - author={Hugging Face}, + author={HuggingFace}, year={2024}, url={https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha}, note={Zephyr} @@ -1373,7 +1680,7 @@ @misc{xu2024dposuperiorppollm @misc{huggingface2024rlhf, title={RLHF}, - author={Hugging Face}, + author={HuggingFace}, year={2024c}, url={https://huggingface.co/blog/rlhf}, note={RLHF} @@ -1412,7 +1719,7 @@ @techreport{finra2024llmguidance24 @misc{huggingface2024trl, title={TRL}, - author={Hugging Face}, + author={HuggingFace}, year={2024d}, url={https://huggingface.co/docs/trl/en/index}, note={TRL} @@ -1590,7 +1897,7 @@ @misc{zhou2024stealtheditshf title={Stealth Edits: Detecting Stealth Edits in LLM Outputs}, author={Qinghua Zhou}, year={2024}, - howpublished={Hugging Face Spaces}, + howpublished={HuggingFace Spaces}, url={https://huggingface.co/spaces/qinghua-zhou/stealth-edits}, } @@ -1767,7 +2074,7 @@ @misc{opensafetylab2024saladdata title={Salad-Data: A Hierarchical and Comprehensive Safety Dataset for Large Language Models}, author={{OpenSafetyLab}}, year={2024}, - howpublished={Hugging Face Dataset}, + howpublished={HuggingFace Dataset}, url={https://huggingface.co/datasets/OpenSafetyLab/Salad-Data}, } @@ -1775,7 +2082,7 @@ @misc{opensafetylab2024saladbenchleaderboard title={Salad-Bench Leaderboard}, author={{OpenSafetyLab}}, year={2024}, - howpublished={Hugging Face Space}, + howpublished={HuggingFace Space}, url={https://huggingface.co/spaces/OpenSafetyLab/Salad-Bench-Leaderboard}, } @@ -1939,3 +2246,263 @@ @misc{mistral2024 } +@article{ren2024reconciling, + title={Reconciling the contrasting narratives on the environmental impact of large language models}, + author={Ren, Shaolei and others}, + journal={Scientific Reports}, + volume={14}, + number={1}, + pages={26310}, + year={2024}, + publisher={Nature Publishing Group}, + doi={10.1038/s41598-024-76682-6} +} + +@misc{epa2023greenhouse, + title={Greenhouse Gas Emissions from a Typical Passenger Vehicle}, + author={{United States Environmental Protection Agency}}, + year={2023}, + howpublished={Website}, + url={https://www.epa.gov/greenvehicles/greenhouse-gas-emissions-typical-passenger-vehicle} +} + +@misc{anthropic2024statistical, + title={A Statistical Approach to Model Evaluation}, + author={{Anthropic}}, + year={2024}, + howpublished={Website}, + url={https://www.anthropic.com/research/statistical-approach-to-model-evals} +} + +@misc{bcs2024deadend, + title={Does current AI represent a dead end?}, + author={{British Computer Society}}, + year={2024}, + howpublished={Website}, + url={https://www.bcs.org/articles-opinion-and-research/does-current-ai-represent-a-dead-end/} +} + + + +@article{oketunji2023a, + edition = {}, + number = {}, + journal = {Data \& Policy}, + pages = {}, + publisher = {Cambridge University Press}, + school = {}, + title = {Large Language Model (LLM) Bias Index—LLMBI}, + volume = {}, + author = {Oketunji, AF and Anas, M and Saina, D}, + editor = {}, + year = {2023}, + series = {} +} + +@misc{chen2023combatingmisinformationagellms, + title={Combating Misinformation in the Age of LLMs: Opportunities and Challenges}, + author={Canyu Chen and Kai Shu}, + year={2023}, + eprint={2311.05656}, + archivePrefix={arXiv}, + primaryClass={cs.CY}, + url={https://arxiv.org/abs/2311.05656}, +} + +@misc{crfm2021website, + title={Introducing the Center for Research on Foundation Models (CRFM)}, + author={{Stanford HAI}}, + year={2021}, + howpublished={Website}, + url={https://hai.stanford.edu/news/introducing-center-research-foundation-models-crfm} +} + +@misc{xu2024benchmarkdatacontaminationlarge, + title={Benchmark Data Contamination of Large Language Models: A Survey}, + author={Cheng Xu and Shuhao Guan and Derek Greene and M-Tahar Kechadi}, + year={2024}, + eprint={2406.04244}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2406.04244}, +} + + +@misc{ollama2024website, + title={Ollama: Get up and running with large language models, locally}, + author={Ollama}, + year={2024}, + howpublished={Website}, + url={https://ollama.com} +} + +@misc{langchain_github, + title={LangChain}, + author={{LangChain}}, + year={2024z}, + howpublished={\url{https://github.com/langchain-ai/langchain}}, + note={Accessed: 12/07/2024} +} + +@misc{bengio2014representationlearningreviewnew, + title={Representation Learning: A Review and New Perspectives}, + author={Yoshua Bengio and Aaron Courville and Pascal Vincent}, + year={2014}, + eprint={1206.5538}, + archivePrefix={arXiv}, + primaryClass={cs.LG}, + url={https://arxiv.org/abs/1206.5538}, +} + +@misc{liu2024enhancingllmscognitionstructurization, + title={Enhancing LLM's Cognition via Structurization}, + author={Kai Liu and Zhihang Fu and Chao Chen and Wei Zhang and Rongxin Jiang and Fan Zhou and Yaowu Chen and Yue Wu and Jieping Ye}, + year={2024}, + eprint={2407.16434}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2407.16434}, +} + +@misc{jacob2024drowningdocumentsconsequencesscaling, + title={Drowning in Documents: Consequences of Scaling Reranker Inference}, + author={Mathew Jacob and Erik Lindgren and Matei Zaharia and Michael Carbin and Omar Khattab and Andrew Drozdov}, + year={2024}, + eprint={2411.11767}, + archivePrefix={arXiv}, + primaryClass={cs.IR}, + url={https://arxiv.org/abs/2411.11767}, +} + +@misc{li2024retrollmempoweringlargelanguage, + title={RetroLLM: Empowering Large Language Models to Retrieve Fine-grained Evidence within Generation}, + author={Xiaoxi Li and Jiajie Jin and Yujia Zhou and Yongkang Wu and Zhonghua Li and Qi Ye and Zhicheng Dou}, + year={2024}, + eprint={2412.11919}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2412.11919}, +} + + + +@misc{wei2023chainofthoughtpromptingelicitsreasoning, + title={Chain-of-Thought Prompting Elicits Reasoning in Large Language Models}, + author={Jason Wei and Xuezhi Wang and Dale Schuurmans and Maarten Bosma and Brian Ichter and Fei Xia and Ed Chi and Quoc Le and Denny Zhou}, + year={2023}, + eprint={2201.11903}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2201.11903}, +} + +@misc{lee2024longcontextlanguagemodelssubsume, + title={Can Long-Context Language Models Subsume Retrieval, RAG, SQL, and More?}, + author={Jinhyuk Lee and Anthony Chen and Zhuyun Dai and Dheeru Dua and Devendra Singh Sachan and Michael Boratko and Yi Luan and Sébastien M. R. Arnold and Vincent Perot and Siddharth Dalmia and Hexiang Hu and Xudong Lin and Panupong Pasupat and Aida Amini and Jeremy R. Cole and Sebastian Riedel and Iftekhar Naim and Ming-Wei Chang and Kelvin Guu}, + year={2024}, + eprint={2406.13121}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2406.13121}, +} + +@misc{li2024retrievalaugmentedgenerationlongcontext, + title={Retrieval Augmented Generation or Long-Context LLMs? A Comprehensive Study and Hybrid Approach}, + author={Zhuowan Li and Cheng Li and Mingyang Zhang and Qiaozhu Mei and Michael Bendersky}, + year={2024}, + eprint={2407.16833}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2407.16833}, +} + +@misc{ragas2024evaluation, + title={RAG Evaluation - Ragas Documentation}, + author={{Ragas}}, + year={2024}, + howpublished={Website}, + url={https://docs.ragas.io/en/stable/getstarted/rag_evaluation/} +} + + +@misc{wu2024longdocumentsummaryevaluation, + title={Less is More for Long Document Summary Evaluation by LLMs}, + author={Yunshu Wu and Hayate Iso and Pouya Pezeshkpour and Nikita Bhutani and Estevam Hruschka}, + year={2024}, + eprint={2309.07382}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2309.07382}, +} + +@misc{he2024doespromptformattingimpact, + title={Does Prompt Formatting Have Any Impact on LLM Performance?}, + author={Jia He and Mukund Rungta and David Koleczek and Arshdeep Sekhon and Franklin X Wang and Sadid Hasan}, + year={2024}, + eprint={2411.10541}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2411.10541}, +} + +@misc{huggingface2024mteb, + title={Massive Text Embedding Benchmark (MTEB) Leaderboard}, + author={{HuggingFace}}, + year={2024i}, + howpublished={Website}, + url={https://huggingface.co/spaces/mteb/leaderboard} +} + + +@misc{llamaindex2024storing, + title={Storing - LlamaIndex Documentation}, + author={{LlamaIndex}}, + year={2024}, + howpublished={Website}, + url={https://docs.llamaindex.ai/en/stable/understanding/storing/storing/} +} + + +@misc{zenml2024rag, + title={Scaling RAG Accuracy from 49\% to 86\% in Finance Q\&A Assistant}, + author={{ZenML}}, + year={2024}, + howpublished={Website}, + url={https://www.zenml.io/llmops-database/scaling-rag-accuracy-from-49-to-86-in-finance-q-a-assistant} +} + + +@misc{sentencetransformers2024website, + title={Sentence Transformers}, + author={{HuggingFace}}, + year={2024f}, + howpublished={Website}, + url={https://huggingface.co/sentence-transformers} +} + + +@misc{chromadb2024docs, + title={ChromaDB Documentation}, + author={{ChromaDB}}, + year={2024b}, + howpublished={Website}, + url={https://docs.trychroma.com/} +} + + +@misc{openai2024embeddings, + title={What are embeddings?}, + author={{OpenAI}}, + year={2024}, + howpublished={Website}, + url={https://platform.openai.com/docs/guides/embeddings/what-are-embeddings} +} + +@misc{chromadb2024hnsw, + title={ChromaDB Cookbook: HNSW Configuration}, + author={{ChromaDB}}, + year={2024a}, + howpublished={Website}, + url={https://cookbook.chromadb.dev/core/configuration/#hnsw-configuration} +} +