Skip to content

Commit

Permalink
Package default datasets into the docker image (#837)
Browse files Browse the repository at this point in the history
* closes #787

* download demo ds

* script

* Update Dockerfile

---------

Co-authored-by: Yauhen Babakhin <y.babakhin@gmail.com>
  • Loading branch information
pascal-pfeiffer and ybabakhin authored Aug 20, 2024
1 parent aae60dd commit 28536fb
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 97 deletions.
6 changes: 6 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ RUN make setup && chmod -R 777 /workspace/.venv
RUN mkdir -p /home/llmstudio/mount
ENV H2O_LLM_STUDIO_WORKDIR=/home/llmstudio/mount

# Download the demo datasets and place them in the /workspace/demo directory
# Set the environment variable for the demo datasets
ENV H2O_LLM_STUDIO_DEMO_DATASETS=/workspace/demo
COPY download_default_datasets.py /workspace/
RUN python download_default_datasets.py

COPY . /workspace

# Remove unnecessary packages; remove build packages again
Expand Down
44 changes: 44 additions & 0 deletions download_default_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import os

import pandas as pd
from datasets import load_dataset


def download_default_datasets_to_local_folder() -> None:
    """
    Download the default demo datasets into a local folder.

    The target folder is read from the H2O_LLM_STUDIO_DEMO_DATASETS
    environment variable; the folder is created if it does not already
    exist. Each dataset is converted to parquet and saved in the folder.

    Raises:
        ValueError: If H2O_LLM_STUDIO_DEMO_DATASETS is not set.
    """
    path = os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS")
    if path is None:
        raise ValueError("H2O_LLM_STUDIO_DEMO_DATASETS is not set.")

    # exist_ok=True already tolerates a pre-existing directory, so the
    # previous os.path.exists() guard was redundant.
    os.makedirs(path, exist_ok=True)

    # Causal Language Modeling dataset: concatenate train + validation splits.
    ds = load_dataset("OpenAssistant/oasst2")
    df = pd.concat(
        [ds["train"].to_pandas(), ds["validation"].to_pandas()], axis=0
    ).reset_index(drop=True)
    df.to_parquet(os.path.join(path, "causal_language_modeling.pq"), index=False)

    # The remaining demo datasets each use only their "train" split, so a
    # single data-driven loop replaces three copy-pasted stanzas.
    train_only_datasets = {
        "dpo_modeling.pq": "Intel/orca_dpo_pairs",
        "classification_modeling.pq": "stanfordnlp/imdb",
        "regression_modeling.pq": "nvidia/HelpSteer2",
    }
    for filename, dataset_name in train_only_datasets.items():
        df = load_dataset(dataset_name)["train"].to_pandas()
        df.to_parquet(os.path.join(path, filename), index=False)


if __name__ == "__main__":
    # Invoked during the Docker image build (see Dockerfile: RUN python
    # download_default_datasets.py) to bundle demo datasets into the image.
    download_default_datasets_to_local_folder()
134 changes: 37 additions & 97 deletions llm_studio/app_utils/default_datasets.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,22 @@
import os
import random
import re
import uuid

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm


def prepare_default_dataset_causal_language_modeling(path):
ds = load_dataset("OpenAssistant/oasst2")
train = ds["train"].to_pandas()
val = ds["validation"].to_pandas()

df = pd.concat([train, val], axis=0).reset_index(drop=True)
if os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS") is None:
ds = load_dataset("OpenAssistant/oasst2")
train = ds["train"].to_pandas()
val = ds["validation"].to_pandas()
df = pd.concat([train, val], axis=0).reset_index(drop=True)
else:
df = pd.read_parquet(
os.path.join(
os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS"),
"causal_language_modeling.pq",
)
)

df_assistant = df[(df.role == "assistant")].copy()
df_prompter = df[(df.role == "prompter")].copy()
Expand Down Expand Up @@ -54,101 +57,38 @@ def prepare_default_dataset_causal_language_modeling(path):


def prepare_default_dataset_dpo_modeling() -> pd.DataFrame:
df = load_dataset("Intel/orca_dpo_pairs")["train"].to_pandas()
if os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS") is None:
df = load_dataset("Intel/orca_dpo_pairs")["train"].to_pandas()
else:
df = pd.read_parquet(
os.path.join(
os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS"), "dpo_modeling.pq"
)
)
return df


def prepare_default_dataset_classification_modeling() -> pd.DataFrame:
df = load_dataset("stanfordnlp/imdb")["train"].to_pandas()
if os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS") is None:
df = load_dataset("stanfordnlp/imdb")["train"].to_pandas()
else:
df = pd.read_parquet(
os.path.join(
os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS"),
"classification_modeling.pq",
)
)
return df


def prepare_default_dataset_regression_modeling() -> pd.DataFrame:
    """Download and return the default regression modeling demo dataset."""
    return load_dataset("nvidia/HelpSteer2")["train"].to_pandas()


def extract_anthropic_prompt(prompt_and_response):
    """Return the prompt prefix, up to and including the last Assistant marker."""
    search_term = "\n\nAssistant:"
    idx = prompt_and_response.rfind(search_term)
    assert idx != -1, f"Prompt and response does not contain '{search_term}'"
    end = idx + len(search_term)
    return prompt_and_response[:end]


def _parse_row(prompt_and_response):
"""Extract the anthropic prompt from a prompt and response pair."""
search_term = "\n\nAssistant:"
search_term_idx = prompt_and_response["chosen"].rfind(search_term)
assert (
search_term_idx != -1
), f"Prompt and response does not contain '{search_term}'"
prompt = prompt_and_response["chosen"][: search_term_idx + len(search_term)]

chosen_response = prompt_and_response["chosen"][len(prompt) :]
rejected_response = prompt_and_response["rejected"][len(prompt) :]

return prompt, chosen_response, rejected_response


def _split_up_prompt(prompt):
human_texts = re.findall(
r"\n\nHuman:(.*?)(?=(\n\nAssistant:|$))", prompt, flags=re.DOTALL
)
assistant_texts = re.findall(
r"\n\nAssistant:(.*?)(?=(\n\nHuman:|$))", prompt, flags=re.DOTALL
)
human_texts = [text[0].strip() for text in human_texts]
assistant_texts = [text[0].strip() for text in assistant_texts]

assert len(human_texts) == len(assistant_texts), prompt
dialogue = list(zip(human_texts, assistant_texts))
return dialogue


def prepare_hh_dpo_modeling(split: str) -> pd.DataFrame:
"""
Adapted from
https://github.com/eric-mitchell/direct-preference-optimization/blob/main/preference_datasets.py
"""
dataset = load_dataset("Anthropic/hh-rlhf", split=split)
rnd = random.Random()
rnd.seed(123)
dfs = []
for row in tqdm(dataset):
prompt, chosen_response, rejected_response = _parse_row(row)
if len(rejected_response) == 0:
# remove rejected answers that are empty
continue

parent_uuid = None
parsed_texts = []
for human_text, assistant_text in _split_up_prompt(prompt):
random_uuid = str(uuid.UUID(int=rnd.getrandbits(128), version=4))
parsed_texts += [
[human_text, assistant_text, random_uuid, parent_uuid, None, None]
]
parent_uuid = random_uuid

parsed_texts[-1][-2] = chosen_response
parsed_texts[-1][-1] = rejected_response
df = pd.DataFrame(
parsed_texts,
columns=[
"instruction",
"output",
"id",
"parent_id",
"chosen_response",
"rejected_response",
],
if os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS") is None:
df = load_dataset("nvidia/HelpSteer2")["train"].to_pandas()
else:
df = pd.read_parquet(
os.path.join(
os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS"),
"regression_modeling.pq",
)
)
dfs.append(df)
df = pd.concat(dfs).reset_index(drop=True)
# merge output into chosen and rejected response
df["chosen_response"] = df["chosen_response"].fillna(df["output"])
df["rejected_response"] = df["rejected_response"].fillna(df["output"])
del df["output"]
return df

0 comments on commit 28536fb

Please sign in to comment.