Skip to content

Commit

Permalink
Merge pull request #5 from Sense/bug_dataset_fix
Browse files Browse the repository at this point in the history
Added dataset bug fix: the evaluation dataset split is now taken from the first (comparison) adapter only.
  • Loading branch information
jev-cloudera authored and GitHub Enterprise committed Oct 28, 2024
2 parents 5c7375f + da82387 commit 3cd4e11
Show file tree
Hide file tree
Showing 12 changed files with 82 additions and 52 deletions.
2 changes: 1 addition & 1 deletion .project-metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ environment_variables:
runtimes:
- editor: JupyterLab
kernel: Python 3.9
kernel: Python 3.11
edition: Nvidia GPU

tasks:
Expand Down
12 changes: 8 additions & 4 deletions ft/eval/mlflow_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def driver(
prompt_id: str = None,
selected_features: List[str] = None,
eval_dataset_fraction: float = None,
comparison_adapter_id: str = None,
client: FineTuningStudioClient = None):

# TODO: remove hard-coded dependencies on GPU driver for evals
Expand All @@ -42,15 +43,18 @@ def driver(
generation_config_dict = json.loads(client.GetConfig(GetConfigRequest(
id=generation_config_id)).config.config) if generation_config_id else None
if adapter_id != BASE_MODEL_ONLY_ADAPTER_ID:
dataset_split: GetDatasetSplitByAdapterMetadata = client.GetDatasetSplitByAdapter(
GetDatasetSplitByAdapterRequest(adapter_id=adapter_id)).response
adapter: AdapterMetadata = client.GetAdapter(GetAdapterRequest(id=adapter_id)).adapter
else:
adapter: AdapterMetadata = AdapterMetadata(type=AdapterType.PROJECT, location=BASE_MODEL_ONLY_ADAPTER_LOCATION)
# Load dataset
if comparison_adapter_id != BASE_MODEL_ONLY_ADAPTER_ID:
dataset_split: GetDatasetSplitByAdapterMetadata = client.GetDatasetSplitByAdapter(
GetDatasetSplitByAdapterRequest(adapter_id=comparison_adapter_id)).response
else:
# as this is only base model evaluation, no need to do any splitting as all data is unseen
dataset_split = GetDatasetSplitByAdapterMetadata(
dataset_fraction=0.2, train_test_split=0.2) # make them variables
adapter: AdapterMetadata = AdapterMetadata(type=AdapterType.PROJECT, location=BASE_MODEL_ONLY_ADAPTER_LOCATION)
# Load dataset

eval_dataset, eval_column_name = dataloader.fetch_evaluation_dataset(
dataset_id, client=client, prompt_metadata=prompt, dataset_split=dataset_split, selected_features=selected_features, eval_dataset_fraction=eval_dataset_fraction)
# Load Model Pipeline
Expand Down
16 changes: 16 additions & 0 deletions ft/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,18 @@ def _validate_start_evaluation_job_request(request: StartEvaluationJobRequest, d
raise ValueError(f"Generation Config with ID '{request.generation_config_id}' does not exist.")


def get_comparison_adapter_id(model_adapter_combinations):
    """
    Return the adapter ID to use for dataset-split comparison.

    Scans the given ModelAdapterCombination list and picks the first entry
    whose adapter ID is not the base-model-only sentinel; falls back to
    BASE_MODEL_ONLY_ADAPTER_ID when every combination is base-model-only.
    """
    non_base_ids = (
        combo.adapter_id
        for combo in model_adapter_combinations
        if combo.adapter_id != BASE_MODEL_ONLY_ADAPTER_ID
    )
    # next() with a default reproduces the "first match or sentinel" scan.
    return next(non_base_ids, BASE_MODEL_ONLY_ADAPTER_ID)


def start_evaluation_job(request: StartEvaluationJobRequest,
cml: CMLServiceApi = None, dao: FineTuningStudioDao = None) -> StartEvaluationJobResponse:
"""
Expand All @@ -120,6 +132,7 @@ def start_evaluation_job(request: StartEvaluationJobRequest,
# TODO: pull this and others into app state
project_id = os.getenv("CDSW_PROJECT_ID")
parent_job_id = str(uuid4())
comparison_adapter_id = get_comparison_adapter_id(request.model_adapter_combinations)
for idx, model_adapter_combo in enumerate(request.model_adapter_combinations):
job_id = str(uuid4())
job_dir = f".app/mlflow_job_runs/{job_id}"
Expand Down Expand Up @@ -174,6 +187,9 @@ def start_evaluation_job(request: StartEvaluationJobRequest,
arg_list.append("--eval_dataset_fraction")
arg_list.append(request.eval_dataset_fraction)

arg_list.append("--comparison_adapter_id")
arg_list.append(comparison_adapter_id)

cpu = request.cpu
gpu = request.gpu
memory = request.memory
Expand Down
2 changes: 1 addition & 1 deletion ft/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def deploy_cml_model(request: ExportModelRequest,

response = ExportModelResponse()
job_id = str(uuid4())

# CML model export requires a HF model and a project-specific adapter.
base_model_hf_name = None
adapter_location = None
Expand Down
3 changes: 3 additions & 0 deletions ft/proto/fine_tuning_studio.proto
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,9 @@ message StartEvaluationJobRequest {
// Fraction of dataset to be used for evaluation
float eval_dataset_fraction = 20;

  // Adapter ID used to determine which dataset split to evaluate against
string comparison_adapter_id = 21;


}
message StartEvaluationJobResponse {
Expand Down
80 changes: 40 additions & 40 deletions ft/proto/fine_tuning_studio_pb2.py

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions ft/proto/fine_tuning_studio_pb2.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,8 @@ class StartEvaluationJobRequest(_message.Message):
"prompt_id",
"gpu_label_id",
"selected_features",
"eval_dataset_fraction")
"eval_dataset_fraction",
"comparison_adapter_id")
TYPE_FIELD_NUMBER: _ClassVar[int]
MODEL_ADAPTER_COMBINATIONS_FIELD_NUMBER: _ClassVar[int]
DATASET_ID_FIELD_NUMBER: _ClassVar[int]
Expand All @@ -524,6 +525,7 @@ class StartEvaluationJobRequest(_message.Message):
GPU_LABEL_ID_FIELD_NUMBER: _ClassVar[int]
SELECTED_FEATURES_FIELD_NUMBER: _ClassVar[int]
EVAL_DATASET_FRACTION_FIELD_NUMBER: _ClassVar[int]
COMPARISON_ADAPTER_ID_FIELD_NUMBER: _ClassVar[int]
type: str
model_adapter_combinations: _containers.RepeatedCompositeFieldContainer[EvaluationJobModelCombination]
dataset_id: str
Expand All @@ -537,6 +539,7 @@ class StartEvaluationJobRequest(_message.Message):
gpu_label_id: int
selected_features: _containers.RepeatedScalarFieldContainer[str]
eval_dataset_fraction: float
comparison_adapter_id: str

def __init__(self,
type: _Optional[str] = ...,
Expand All @@ -552,7 +555,8 @@ class StartEvaluationJobRequest(_message.Message):
prompt_id: _Optional[str] = ...,
gpu_label_id: _Optional[int] = ...,
selected_features: _Optional[_Iterable[str]] = ...,
eval_dataset_fraction: _Optional[float] = ...) -> None: ...
eval_dataset_fraction: _Optional[float] = ...,
comparison_adapter_id: _Optional[str] = ...) -> None: ...


class StartEvaluationJobResponse(_message.Message):
Expand Down
2 changes: 1 addition & 1 deletion ft/scripts/cml_model_predict_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
mlt = MLFlowTransformers()

# Main script used as the prediction/generation base of a deployed CML Model. This script
# globally loads a model and an adapter onto a CUDA-compatible GPU. The model exposes
# globally loads a model and an adapter onto a CUDA-compatible GPU. The model exposes
# an "api_wrapper" function that expects a dictionary payload of an input prompt, as well
# as generation arguments.

Expand Down
2 changes: 2 additions & 0 deletions ft/scripts/mlflow_evaluation_base_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
parser.add_argument("--selected_features", help="Names of the columns to be shown in the evaluation csv", default=None)
parser.add_argument("--eval_dataset_fraction", type=float, default=EVAL_DATASET_DEFAULT_FRACTION,
help="Percentage of eval dataset to be used for evaluation")
parser.add_argument("--comparison_adapter_id", help="ID of the adapter to be compared", default=None)

args = parser.parse_args(arg_string.split())

Expand All @@ -50,6 +51,7 @@
generation_config_id=args.generation_config_id,
selected_features=args.selected_features,
eval_dataset_fraction=args.eval_dataset_fraction,
comparison_adapter_id=args.comparison_adapter_id,
client=client
)
print(response.metrics)
Expand Down
2 changes: 1 addition & 1 deletion requirements-inference.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Requirements file for model inference as a CML model. Installed via cdsw-build.sh.

setuptools==58.1.0
setuptools==74.1.0
torch==2.3.1
bitsandbytes==0.43.3
accelerate==0.33.0
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ torchvision==0.18.1
# Disabling axolotl until we finish implementation
# -e git+https://github.com/axolotl-ai-cloud/axolotl@f07802f9fa9ae95f0b37ce626eaf21eca9fce738#egg=axolotl

setuptools==58.1.0
setuptools==74.1.0
bitsandbytes==0.43.3
datasets==2.20.0
accelerate==0.33.0
Expand Down
3 changes: 2 additions & 1 deletion tests/test_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,11 @@ class MockCMLListJobsResponse:
def __init__(self, jobs: List[MockCMLCreatedJob] = []):
self.jobs = jobs


class MockPath:
    """Minimal stand-in for pathlib.Path used by the job tests.

    Records the path name it was built with and accepts (and ignores)
    mkdir() calls so code under test can "create" directories without
    touching the filesystem.
    """

    def __init__(self, path_name: str = None):
        # Keep the raw name so assertions can inspect which path was requested.
        self.path_name = path_name

    def mkdir(self, **kwargs):
        # No-op: directory creation is irrelevant in unit tests.
        return None

Expand Down

0 comments on commit 3cd4e11

Please sign in to comment.