diff --git a/Dockerfile b/Dockerfile index 7965d7294a..ff02cf1e57 100644 --- a/Dockerfile +++ b/Dockerfile @@ -48,6 +48,12 @@ RUN make setup && chmod -R 777 /workspace/.venv RUN mkdir -p /home/llmstudio/mount ENV H2O_LLM_STUDIO_WORKDIR=/home/llmstudio/mount +# Download the demo datasets and place in the /workspace/demo directory +# Set the environment variable for the demo datasets +ENV H2O_LLM_STUDIO_DEMO_DATASETS=/workspace/demo +COPY download_default_datasets.py /workspace/ +RUN python download_default_datasets.py + COPY . /workspace # Remove unnecessary packages remove build packages again diff --git a/download_default_datasets.py b/download_default_datasets.py new file mode 100644 index 0000000000..0b1ed5d7c8 --- /dev/null +++ b/download_default_datasets.py @@ -0,0 +1,44 @@ +import os + +import pandas as pd +from datasets import load_dataset + + +def download_default_datasets_to_local_folder() -> None: + """ + Downloads the default datasets to a local folder. + + The temporary folder is given by the ENV var H2O_LLM_STUDIO_DEMO_DATASETS. + If the ENV var is not set, this function will raise an error. + The datasets are transformed to parquet format and saved in the folder. + """ + + path = os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS") + if path is None: + raise ValueError("H2O_LLM_STUDIO_DEMO_DATASETS is not set.") + + if not os.path.exists(path): + os.makedirs(path, exist_ok=True) + + # Prepare Causal Language Modeling Dataset + ds = load_dataset("OpenAssistant/oasst2") + train = ds["train"].to_pandas() + val = ds["validation"].to_pandas() + df = pd.concat([train, val], axis=0).reset_index(drop=True) + df.to_parquet(os.path.join(path, "causal_language_modeling.pq"), index=False) + + # Prepare DPO Modeling Dataset + df = load_dataset("Intel/orca_dpo_pairs")["train"].to_pandas() + df.to_parquet(os.path.join(path, "dpo_modeling.pq"), index=False) + + # Prepare Classification Modeling Dataset + df = load_dataset("stanfordnlp/imdb")["train"].to_pandas() + df.to_parquet(os.path.join(path, "classification_modeling.pq"), index=False) + + # Prepare Regression Modeling Dataset + df = load_dataset("nvidia/HelpSteer2")["train"].to_pandas() + df.to_parquet(os.path.join(path, "regression_modeling.pq"), index=False) + + +if __name__ == "__main__": + download_default_datasets_to_local_folder() diff --git a/llm_studio/app_utils/default_datasets.py b/llm_studio/app_utils/default_datasets.py index c8314202f2..7201ea9529 100644 --- a/llm_studio/app_utils/default_datasets.py +++ b/llm_studio/app_utils/default_datasets.py @@ -1,19 +1,22 @@ import os -import random -import re -import uuid import pandas as pd from datasets import load_dataset -from tqdm import tqdm def prepare_default_dataset_causal_language_modeling(path): - ds = load_dataset("OpenAssistant/oasst2") - train = ds["train"].to_pandas() - val = ds["validation"].to_pandas() - - df = pd.concat([train, val], axis=0).reset_index(drop=True) + if os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS") is None: + ds = load_dataset("OpenAssistant/oasst2") + train = ds["train"].to_pandas() + val = ds["validation"].to_pandas() + df = pd.concat([train, val], axis=0).reset_index(drop=True) + else: + df = pd.read_parquet( + os.path.join( + os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS"), + "causal_language_modeling.pq", + ) + ) df_assistant = df[(df.role == "assistant")].copy() df_prompter = df[(df.role == "prompter")].copy() @@ -54,101 +57,38 @@ def prepare_default_dataset_causal_language_modeling(path): def prepare_default_dataset_dpo_modeling() -> pd.DataFrame: - df = load_dataset("Intel/orca_dpo_pairs")["train"].to_pandas() + if os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS") is None: + df = load_dataset("Intel/orca_dpo_pairs")["train"].to_pandas() + else: + df = pd.read_parquet( + os.path.join( + os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS"), "dpo_modeling.pq" + ) + ) return df def prepare_default_dataset_classification_modeling() -> pd.DataFrame: - df = load_dataset("stanfordnlp/imdb")["train"].to_pandas() + if os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS") is None: + df = load_dataset("stanfordnlp/imdb")["train"].to_pandas() + else: + df = pd.read_parquet( + os.path.join( + os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS"), + "classification_modeling.pq", + ) + ) return df def prepare_default_dataset_regression_modeling() -> pd.DataFrame: - df = load_dataset("nvidia/HelpSteer2")["train"].to_pandas() - return df - - -def extract_anthropic_prompt(prompt_and_response): - """Extract the anthropic prompt from a prompt and response pair.""" - search_term = "\n\nAssistant:" - search_term_idx = prompt_and_response.rfind(search_term) - assert ( - search_term_idx != -1 - ), f"Prompt and response does not contain '{search_term}'" - return prompt_and_response[: search_term_idx + len(search_term)] - - -def _parse_row(prompt_and_response): - """Extract the anthropic prompt from a prompt and response pair.""" - search_term = "\n\nAssistant:" - search_term_idx = prompt_and_response["chosen"].rfind(search_term) - assert ( - search_term_idx != -1 - ), f"Prompt and response does not contain '{search_term}'" - prompt = prompt_and_response["chosen"][: search_term_idx + len(search_term)] - - chosen_response = prompt_and_response["chosen"][len(prompt) :] - rejected_response = prompt_and_response["rejected"][len(prompt) :] - - return prompt, chosen_response, rejected_response - - -def _split_up_prompt(prompt): - human_texts = re.findall( - r"\n\nHuman:(.*?)(?=(\n\nAssistant:|$))", prompt, flags=re.DOTALL - ) - assistant_texts = re.findall( - r"\n\nAssistant:(.*?)(?=(\n\nHuman:|$))", prompt, flags=re.DOTALL - ) - human_texts = [text[0].strip() for text in human_texts] - assistant_texts = [text[0].strip() for text in assistant_texts] - - assert len(human_texts) == len(assistant_texts), prompt - dialogue = list(zip(human_texts, assistant_texts)) - return dialogue - - -def prepare_hh_dpo_modeling(split: str) -> pd.DataFrame: - """ - Adapted from - https://github.com/eric-mitchell/direct-preference-optimization/blob/main/preference_datasets.py - """ - dataset = load_dataset("Anthropic/hh-rlhf", split=split) - rnd = random.Random() - rnd.seed(123) - dfs = [] - for row in tqdm(dataset): - prompt, chosen_response, rejected_response = _parse_row(row) - if len(rejected_response) == 0: - # remove rejected answers that are empty - continue - - parent_uuid = None - parsed_texts = [] - for human_text, assistant_text in _split_up_prompt(prompt): - random_uuid = str(uuid.UUID(int=rnd.getrandbits(128), version=4)) - parsed_texts += [ - [human_text, assistant_text, random_uuid, parent_uuid, None, None] - ] - parent_uuid = random_uuid - - parsed_texts[-1][-2] = chosen_response - parsed_texts[-1][-1] = rejected_response - df = pd.DataFrame( - parsed_texts, - columns=[ - "instruction", - "output", - "id", - "parent_id", - "chosen_response", - "rejected_response", - ], + if os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS") is None: + df = load_dataset("nvidia/HelpSteer2")["train"].to_pandas() + else: + df = pd.read_parquet( + os.path.join( + os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS"), + "regression_modeling.pq", + ) ) - dfs.append(df) - df = pd.concat(dfs).reset_index(drop=True) - # merge output into chosen and rejected response - df["chosen_response"] = df["chosen_response"].fillna(df["output"]) - df["rejected_response"] = df["rejected_response"].fillna(df["output"]) - del df["output"] return df