Skip to content

Commit

Permalink
Merge pull request #34 from parkervg/ingredients-rework
Browse files Browse the repository at this point in the history
Ingredients rework
  • Loading branch information
parkervg authored Oct 15, 2024
2 parents 9daf09f + 9c1b73b commit 5d9c9cd
Show file tree
Hide file tree
Showing 36 changed files with 1,280 additions and 665 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jobs:
python -m tox
- name: "Upload coverage data"
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: covdata
path: .coverage.*
Expand All @@ -66,7 +66,7 @@ jobs:
python -m pip install tox tox-gh-actions
- name: "Download coverage data"
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: covdata

Expand Down
121 changes: 121 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,127 @@ print(smoothie.meta.prompts)
}
```

### Few-Shot Prompting
Few-shot prompting can be vital for the LLM-based ingredients in BlendSQL. `LLMMap`, `LLMQA`, and `LLMJoin` each accept custom few-shot examples and, at runtime, dynamically retrieve the top-`k` examples most relevant to the current inference example.
#### `LLMMap`
- [Default examples](./blendsql/ingredients/builtin/map/default_examples.json)
- [All possible fields](./blendsql/ingredients/builtin/map/examples.py)

```python
from blendsql import blend, LLMMap
from blendsql.ingredients.builtin import DEFAULT_MAP_FEW_SHOT

ingredients = {
LLMMap.from_args(
few_shot_examples=[
*DEFAULT_MAP_FEW_SHOT,
{
"question": "Is this a sport?",
"mapping": {
"Soccer": "t",
"Chair": "f",
"Banana": "f",
"Golf": "t"
},
# Below are optional
"column_name": "Items",
"table_name": "Table",
"example_outputs": ["t", "f"],
"options": ["t", "f"],
"output_type": "boolean"
}
],
# Fetches the `k` most relevant few-shot examples using an embedding-based retriever
k=2,
# How many inference values to pass to model at once
batch_size=5,
)
}
smoothie = blend(
query=blendsql,
db=db,
ingredients=ingredients,
default_model=model,
)
```

#### `LLMQA`
- [Default examples](./blendsql/ingredients/builtin/qa/default_examples.json)
- [All possible fields](./blendsql/ingredients/builtin/qa/examples.py)

```python
from blendsql import blend, LLMQA
from blendsql.ingredients.builtin import DEFAULT_QA_FEW_SHOT

ingredients = {
LLMQA.from_args(
few_shot_examples=[
*DEFAULT_QA_FEW_SHOT,
{
"question": "Which weighs the most?",
"context": {
"Animal": ["Dog", "Gorilla", "Hamster"],
"Weight": ["20 pounds", "350 lbs", "100 grams"]
},
"answer": "Gorilla",
# Below are optional
"options": ["Dog", "Gorilla", "Hamster"]
}
],
# Fetches the `k` most relevant few-shot examples using an embedding-based retriever
k=2,
# Lambda to turn the pd.DataFrame to a serialized string
context_formatter=lambda df: df.to_markdown(
index=False
)
)
}
smoothie = blend(
query=blendsql,
db=db,
ingredients=ingredients,
default_model=model,
)
```

#### `LLMJoin`
- [Default examples](./blendsql/ingredients/builtin/join/default_examples.json)
- [All possible fields](./blendsql/ingredients/builtin/join/examples.py)

```python
from blendsql import blend, LLMJoin
from blendsql.ingredients.builtin import DEFAULT_JOIN_FEW_SHOT

ingredients = {
LLMJoin.from_args(
few_shot_examples=[
*DEFAULT_JOIN_FEW_SHOT,
{
"join_criteria": "Join the state to its capital.",
"left_values": ["California", "Massachusetts", "North Carolina"],
"right_values": ["Sacramento", "Boston", "Chicago"],
"mapping": {
"California": "Sacramento",
"Massachusetts": "Boston",
"North Carolina": "-"
}
}
],
# Fetches the `k` most relevant few-shot examples using an embedding-based retriever
k=2
)
}
smoothie = blend(
query=blendsql,
db=db,
ingredients=ingredients,
default_model=model,
)
```


### Acknowledgements
Special thanks to those below for inspiring this project. We definitely recommend checking out the linked work below, and citing it when applicable!

Expand Down
2 changes: 1 addition & 1 deletion blendsql/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def __contains__(cls, item):

DEFAULT_ANS_SEP = ";"
DEFAULT_NAN_ANS = "-"
MAP_BATCH_SIZE = 15
MAP_BATCH_SIZE = 5


class IngredientType(str, Enum, metaclass=StrInMeta):
Expand Down
7 changes: 4 additions & 3 deletions blendsql/_logger.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import logging

logging.basicConfig()


def msg_box(msg, indent=1, width=None, title=None):
"""Print message-box with optional title."""
Expand Down Expand Up @@ -77,4 +75,7 @@ def getChild(self, name: str) -> logging.Logger:
return child


logger = Logger("blendsql", logging.DEBUG)
# logging.setLoggerClass(Logger)
logging.basicConfig()
logger = Logger("blendsql")
# logger = logging.getLogger("blendsql")
1 change: 1 addition & 0 deletions blendsql/_smoothie.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class SmoothieMeta:
prompt_tokens: int
completion_tokens: int
prompts: List[dict] # Log of prompts submitted to model
raw_prompts: List[str]
ingredients: Iterable[Type[Ingredient]]
query: str
db_url: str
Expand Down
15 changes: 10 additions & 5 deletions blendsql/blend.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,15 +91,15 @@ def extend(self, ingredients: Iterable[Type[Ingredient]]) -> None:
name not in self.name_to_ingredient
), f"Duplicate ingredient names passed! These are case insensitive, be careful.\n{name}"
# Initialize the ingredient, going from `Type[Ingredient]` to `Ingredient`
initialied_ingredient: Ingredient = ingredient(
initialized_ingredient: Ingredient = ingredient(
name=name,
# Add db and session_uuid as default kwargs
# This way, ingredients are able to interact with data
db=self.db,
session_uuid=self.session_uuid,
)
self.name_to_ingredient[name] = initialied_ingredient
self.append(initialied_ingredient)
self.name_to_ingredient[name] = initialized_ingredient
self.append(initialized_ingredient)


def autowrap_query(
Expand Down Expand Up @@ -442,6 +442,9 @@ def _blend(
default_model.completion_tokens if default_model is not None else 0
),
prompts=default_model.prompts if default_model is not None else [],
raw_prompts=default_model.raw_prompts
if default_model is not None
else [],
ingredients=[],
query=original_query,
db_url=str(db.db_url),
Expand All @@ -459,6 +462,7 @@ def _blend(
# Mapping from {"QA('does this company...', 'constituents::Name')": 'does this company'...})
function_call_to_res: Dict[str, str] = {}
session_modified_tables = set()
scm = None
# TODO: Currently, as we traverse upwards from deepest subquery,
# if any lower subqueries have an ingredient, we deem the current
# as ineligible for optimization. Maybe this can be improved in the future.
Expand Down Expand Up @@ -892,6 +896,7 @@ def _blend(
if default_model is not None
else 0,
prompts=default_model.prompts if default_model is not None else [],
raw_prompts=default_model.raw_prompts if default_model is not None else [],
ingredients=ingredients,
query=original_query,
db_url=str(db.db_url),
Expand Down Expand Up @@ -1009,8 +1014,8 @@ def blend(
'''
if verbose:
logger.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.ERROR)
for handler in logger.handlers:
handler.setLevel(logging.DEBUG)
start = time.time()
try:
smoothie = _blend(
Expand Down
6 changes: 3 additions & 3 deletions blendsql/ingredients/builtin/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .join.main import LLMJoin
from .qa.main import LLMQA
from .map.main import LLMMap
from .join.main import LLMJoin, DEFAULT_JOIN_FEW_SHOT
from .qa.main import LLMQA, DEFAULT_QA_FEW_SHOT
from .map.main import LLMMap, DEFAULT_MAP_FEW_SHOT
from .vqa.main import ImageCaption
from .validate.main import LLMValidate
27 changes: 27 additions & 0 deletions blendsql/ingredients/builtin/join/default_examples.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
[
{
"join_criteria": "Join to same topics.",
"left_values": ["joshua fields", "bob brown", "ron ryan"],
"right_values": [
"ron ryan",
"colby mules",
"bob brown (ice hockey)",
"josh fields (pitcher)"
],
"mapping": {
"joshua fields": "josh fields (pitcher)",
"bob brown": "bob brown (ice hockey)",
"ron ryan": "ron ryan"
}
},
{
"join_criteria": "Join the fruit to its color",
"left_values": ["banana", "apple", "orange"],
"right_values": ["yellow", "red"],
"mapping": {
"banana": "yellow",
"apple": "red",
"orange": "-"
}
}
]
36 changes: 36 additions & 0 deletions blendsql/ingredients/builtin/join/examples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from attr import attrs, attrib
from typing import List, Dict

from blendsql.utils import newline_dedent
from blendsql.ingredients.few_shot import Example


@attrs(kw_only=True)
class JoinExample(Example):
    """Unlabelled few-shot example for a join prompt.

    Holds the join criteria together with the left-hand and right-hand
    values to be aligned, and renders them as prompt text.
    """

    # Natural-language description of how the two sides relate.
    join_criteria: str = attrib(default="Join to the same topics.")
    # Values on the left side of the join; no default, so it must be supplied.
    left_values: List[str] = attrib()
    # Values on the right side of the join; no default, so it must be supplied.
    right_values: List[str] = attrib()

    def to_string(self) -> str:
        """Render the example as prompt text ending with an open ``Output:`` line.

        Each side's values are written one per line. The filled-in template is
        passed through ``newline_dedent`` from ``blendsql.utils``.
        NOTE(review): presumably that helper strips per-line indentation --
        confirm in ``blendsql.utils``.
        """
        left_block = "\n".join(self.left_values)
        right_block = "\n".join(self.right_values)
        # Template text is kept flush-left so the literal itself carries no
        # indentation that would depend on the helper's dedent behaviour.
        template = """
Criteria: {}
Left Values:
{}
Right Values:
{}
Output:
"""
        filled = template.format(self.join_criteria, left_block, right_block)
        return newline_dedent(filled)


@attrs(kw_only=True)
class AnnotatedJoinExample(JoinExample):
    """A ``JoinExample`` that additionally carries a ``mapping`` annotation."""

    # No default, so it must be supplied (keyword-only via ``kw_only=True``).
    # NOTE(review): presumably maps each left value to its matched right value
    # -- confirm against callers.
    mapping: Dict[str, str] = attrib()
Loading

0 comments on commit 5d9c9cd

Please sign in to comment.