
Commit

update refs
souzatharsis committed Dec 19, 2024
1 parent 95bfad4 commit 2a17ca9
Showing 9 changed files with 170 additions and 170 deletions.
Binary file modified tamingllms/_build/.doctrees/environment.pickle
Binary file not shown.
Binary file modified tamingllms/_build/.doctrees/notebooks/alignment.doctree
Binary file not shown.
2 changes: 1 addition & 1 deletion tamingllms/_build/html/_sources/notebooks/alignment.ipynb
@@ -383,7 +383,7 @@
"* **Situational Awareness:** Alignment faking requires the model to be aware of its training context, understanding when it's being trained and when it is not. This awareness can be provided explicitly, such as in prompts, or implicitly through fine-tuning on relevant data.\n",
"* **Reasoning Ability:** The model must be capable of reasoning about its situation, understanding the consequences of complying or not complying with the training objective. This can involve explicit chain-of-thought reasoning or more opaque reasoning within the model's activations.\n",
"\n",
-"The work by `askell2024alignmentfaking` also provides some key early findings:\n",
+"The work by {cite}`askell2024alignmentfaking` also provides some key early findings:\n",
"\n",
"* **Emergence without explicit instruction:** LLMs can exhibit alignment faking without being explicitly instructed to do so. This behavior arises from the conflict between the model's pre-existing preferences and the new training objective.\n",
"* **Scale-dependent:** Alignment faking is more likely to occur in larger, more capable models. Models like Claude 3 Opus and Claude 3.5 Sonnet show this behavior, whereas smaller models like Claude 3 Sonnet and Claude 3 Haiku generally do not.\n",
326 changes: 163 additions & 163 deletions tamingllms/_build/html/notebooks/alignment.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tamingllms/_build/html/searchindex.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tamingllms/_build/jupyter_execute/markdown/intro.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "markdown",
-"id": "fcbcf54e",
+"id": "634f43f5",
"metadata": {},
"source": [
"(intro)=\n",
@@ -383,7 +383,7 @@
"* **Situational Awareness:** Alignment faking requires the model to be aware of its training context, understanding when it's being trained and when it is not. This awareness can be provided explicitly, such as in prompts, or implicitly through fine-tuning on relevant data.\n",
"* **Reasoning Ability:** The model must be capable of reasoning about its situation, understanding the consequences of complying or not complying with the training objective. This can involve explicit chain-of-thought reasoning or more opaque reasoning within the model's activations.\n",
"\n",
-"The work by `askell2024alignmentfaking` also provides some key early findings:\n",
+"The work by {cite}`askell2024alignmentfaking` also provides some key early findings:\n",
"\n",
"* **Emergence without explicit instruction:** LLMs can exhibit alignment faking without being explicitly instructed to do so. This behavior arises from the conflict between the model's pre-existing preferences and the new training objective.\n",
"* **Scale-dependent:** Alignment faking is more likely to occur in larger, more capable models. Models like Claude 3 Opus and Claude 3.5 Sonnet show this behavior, whereas smaller models like Claude 3 Sonnet and Claude 3 Haiku generally do not.\n",
2 changes: 1 addition & 1 deletion tamingllms/notebooks/alignment.ipynb
@@ -383,7 +383,7 @@
"* **Situational Awareness:** Alignment faking requires the model to be aware of its training context, understanding when it's being trained and when it is not. This awareness can be provided explicitly, such as in prompts, or implicitly through fine-tuning on relevant data.\n",
"* **Reasoning Ability:** The model must be capable of reasoning about its situation, understanding the consequences of complying or not complying with the training objective. This can involve explicit chain-of-thought reasoning or more opaque reasoning within the model's activations.\n",
"\n",
-"The work by `askell2024alignmentfaking` also provides some key early findings:\n",
+"The work by {cite}`askell2024alignmentfaking` also provides some key early findings:\n",
"\n",
"* **Emergence without explicit instruction:** LLMs can exhibit alignment faking without being explicitly instructed to do so. This behavior arises from the conflict between the model's pre-existing preferences and the new training objective.\n",
"* **Scale-dependent:** Alignment faking is more likely to occur in larger, more capable models. Models like Claude 3 Opus and Claude 3.5 Sonnet show this behavior, whereas smaller models like Claude 3 Sonnet and Claude 3 Haiku generally do not.\n",
4 changes: 2 additions & 2 deletions tamingllms/references.bib
@@ -519,15 +519,15 @@ @misc{dong2024selfboostinglargelanguagemodels
@misc{askell2024alignmentfaking,
title={Alignment Faking in Large Language Models},
author={Amanda Askell and Jan Brauner and Adrian Colyer and Benjamin Cullen and David Duvenaud and Richard Ngo and Azalia Mirhoseini and Catherine Olsson and Sam Ringer and Liam Skirvin and Jess Smith and Dawn Song and William Saunders and Steinhardt, Jacob},
-year={2024},
+year={2024a},
publisher={Anthropic},
url={https://assets.anthropic.com/m/983c85a201a962f/original/Alignment-Faking-in-Large-Language-Models-full-paper.pdf}
}

@misc{askell2024alignmentfakingreviews,
title={Alignment Faking in Large Language Models: Reviews},
author={Amanda Askell and Jan Brauner and Adrian Colyer and Benjamin Cullen and David Duvenaud and Richard Ngo and Azalia Mirhoseini and Catherine Olsson and Sam Ringer and Liam Skirvin and Jess Smith and Dawn Song and William Saunders and Steinhardt, Jacob},
-year={2024},
+year={2024b},
publisher={Anthropic},
url={https://assets.anthropic.com/m/24c8d0a3a7d0a1f1/original/Alignment-Faking-in-Large-Language-Models-reviews.pdf}
}
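
For context, a minimal sketch of how these keys are consumed elsewhere in the book, assuming the usual jupyter-book + sphinxcontrib-bibtex setup (the rendered labels below are illustrative assumptions, not output from this commit):

    {cite}`askell2024alignmentfaking`          -> e.g. (Askell et al., 2024a)
    {cite}`askell2024alignmentfakingreviews`   -> e.g. (Askell et al., 2024b)

Because both entries share the same authors and year, the 2024a/2024b year suffixes appear intended to keep the two citations distinguishable in the rendered text and bibliography.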
