Commit e91cab8 (1 parent: 02ac5a5)
Showing 7 changed files with 296 additions and 13 deletions.
@@ -2,7 +2,7 @@
VERSION=0
MINOR=0
-PATCH=1
+PATCH=2
EXTRAVERSION=""

NOTES="(update)"
@@ -0,0 +1,229 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CtesibioAI v0.0.2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### CtesibioAI ships with this notebook so you can train your model in Colab, using Colab's compute power"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Follow these steps to run your training\n",
    "\n",
    "* Install the requirements\n",
    "* Provide the data with questions and answers\n",
    "* Train the model\n",
    "* Test the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "!pip install transformers datasets torch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Provide the data with questions and answers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "datas = [\n",
    "    {\"pergunta\": \"Qual é a capital do Brasil?\", \"resposta\": \"A capital do Brasil é Brasília.\"},\n",
    "    {\"pergunta\": \"Quem descobriu o Brasil?\", \"resposta\": \"O Brasil foi descoberto por Pedro Álvares Cabral.\"},\n",
    "    {\"pergunta\": \"Qual é a maior floresta tropical do mundo?\", \"resposta\": \"A maior floresta tropical do mundo é a Floresta Amazônica.\"},\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments\n",
    "from datasets import Dataset\n",
    "\n",
    "# `datas` is defined in the cell above, so no import from training_data.py is needed in Colab\n",
    "\n",
    "model_name = \"gpt2\"\n",
    "tokenizer = GPT2Tokenizer.from_pretrained(model_name)\n",
    "model = GPT2LMHeadModel.from_pretrained(model_name)\n",
    "\n",
    "special_tokens = {\"pad_token\": \"<PAD>\", \"bos_token\": \"<BOS>\", \"eos_token\": \"<EOS>\"}\n",
    "if tokenizer.pad_token is None:\n",
    "    tokenizer.add_special_tokens(special_tokens)\n",
    "    model.resize_token_embeddings(len(tokenizer))\n",
    "\n",
    "formatted_data = [{\"text\": f\"<BOS>{d['pergunta']} {d['resposta']}<EOS>\"} for d in datas]\n",
    "\n",
    "dataset = Dataset.from_list(formatted_data)\n",
    "\n",
    "def tokenize_function(example):\n",
    "    encoding = tokenizer(\n",
    "        example[\"text\"],\n",
    "        padding=\"max_length\",\n",
    "        truncation=True,\n",
    "        max_length=128\n",
    "    )\n",
    "    encoding[\"labels\"] = encoding[\"input_ids\"].copy()\n",
    "    return encoding\n",
    "\n",
    "tokenized_dataset = dataset.map(tokenize_function, batched=True)\n",
    "\n",
    "\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=\"./ctesibioAI-model\",\n",
    "    overwrite_output_dir=True,\n",
    "    per_device_train_batch_size=2,\n",
    "    num_train_epochs=50,\n",
    "    save_steps=500,\n",
    "    save_total_limit=2,\n",
    "    logging_dir=\"./logs\",\n",
    "    logging_steps=10,\n",
    "    report_to=[],  # avoid integration with wandb or other logging systems\n",
    "    evaluation_strategy=\"no\",  # disable evaluation during training\n",
    ")\n",
    "\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_dataset,\n",
    "    tokenizer=tokenizer\n",
    ")\n",
    "\n",
    "# train, then save the model and tokenizer\n",
    "trainer.train()\n",
    "model.save_pretrained(\"./ctesibioAI-model\")\n",
    "tokenizer.save_pretrained(\"./ctesibioAI-model\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "from transformers import GPT2LMHeadModel, GPT2Tokenizer\n",
    "\n",
    "model = GPT2LMHeadModel.from_pretrained(\"./ctesibioAI-model\")\n",
    "tokenizer = GPT2Tokenizer.from_pretrained(\"./ctesibioAI-model\")\n",
    "\n",
    "tokenizer.pad_token = \"<PAD>\"\n",
    "tokenizer.bos_token = \"<BOS>\"\n",
    "tokenizer.eos_token = \"<EOS>\"\n",
    "\n",
    "input_text = \"<BOS>capital do brasil?\"\n",
    "inputs = tokenizer.encode(input_text, return_tensors=\"pt\")\n",
    "\n",
    "output = model.generate(\n",
    "    inputs,\n",
    "    max_length=50,\n",
    "    num_return_sequences=1,\n",
    "    do_sample=True,  # sampling must be enabled for temperature/top_k to take effect\n",
    "    pad_token_id=tokenizer.pad_token_id,  # keep consistent with training\n",
    "    temperature=0.7,  # control randomness\n",
    "    top_k=50,  # sample only from the 50 most likely tokens\n",
    "    repetition_penalty=2.0,  # penalize repetition\n",
    ")\n",
    "\n",
    "generated_text = tokenizer.decode(output[0], skip_special_tokens=True)\n",
    "print(\"Ctesibio Response:\")\n",
    "print(generated_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optionally, download the model to use it wherever you want"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "\"\"\"This cell only downloads the trained model to your machine through Colab.\"\"\"\n",
    "\n",
    "from google.colab import files\n",
    "import shutil\n",
    "\n",
    "pasta_para_zipar = './ctesibioAI-model'\n",
    "file_zip = 'ctesibioAI-model.zip'\n",
    "\n",
    "\n",
    "shutil.make_archive(base_name=file_zip.replace('.zip', ''), format='zip', root_dir=pasta_para_zipar)\n",
    "\n",
    "files.download(file_zip)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
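After training, a quick way to sanity-check the saved model is a transformers text-generation pipeline. This is only a minimal sketch under the notebook's assumptions (the ./ctesibioAI-model output directory and the <BOS> prompt prefix); the example prompt itself is illustrative, not part of the repository.

# Minimal sketch: quick single-prompt check of the saved model.
# Assumes ./ctesibioAI-model was produced by the training cell above.
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="./ctesibioAI-model",
    tokenizer="./ctesibioAI-model",
)

# Hypothetical example prompt, following the "<BOS>question" format used in training.
result = generator("<BOS>Qual é a capital do Brasil?", max_length=50, num_return_sequences=1)
print(result[0]["generated_text"])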
@@ -0,0 +1,47 @@
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./ctesibioAI-model")
tokenizer = GPT2Tokenizer.from_pretrained("./ctesibioAI-model")

# Configure special tokens
tokenizer.pad_token = "<PAD>"
tokenizer.bos_token = "<BOS>"
tokenizer.eos_token = "<EOS>"

# Initialize the conversation history
history = "<BOS>"

print("Ctesibio AI - Interactive Chatbot Test")
print("Type 'exit' to end the conversation.\n")

while True:
    # User input
    user_input = input("You: ")
    if user_input.lower() == "exit":
        print("Ending the conversation. Goodbye!")
        break

    # Update the history with the user's input
    history += f"{user_input}<EOS>"

    # Tokenize the history
    inputs = tokenizer.encode(history, return_tensors="pt")

    # Generate the model's response
    outputs = model.generate(
        inputs,
        max_new_tokens=50,  # generate up to 50 tokens beyond the prompt
        num_return_sequences=1,
        do_sample=True,  # sampling must be enabled for temperature/top_k to take effect
        pad_token_id=tokenizer.pad_token_id,  # ensure consistency with training
        temperature=0.7,  # control randomness
        top_k=50,  # consider only the top 50 most likely tokens
        repetition_penalty=2.0,  # penalize repetitions
    )

    # Decode and display only the newly generated tokens, not the whole history
    response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
    print("Ctesibio:", response)

    # Update the history with the model's response
    history += f"{response}<EOS>"
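One caveat of the loop above is that the history string grows with every turn, while the base gpt2 checkpoint can only attend to 1024 tokens. Below is a minimal sketch of one possible way to keep the prompt within that budget, reusing the tokenizer and history variables from the script; the exact limits are assumptions to adjust as needed.

# Sketch: trim old turns so the encoded history stays within GPT-2's context window.
MAX_CONTEXT_TOKENS = 1024   # context window of the base "gpt2" checkpoint
RESERVED_FOR_REPLY = 50     # room for the generated reply (matches max_new_tokens above)

history_ids = tokenizer.encode(history)
budget = MAX_CONTEXT_TOKENS - RESERVED_FOR_REPLY
if len(history_ids) > budget:
    history_ids = history_ids[-budget:]      # keep only the most recent tokens
    history = tokenizer.decode(history_ids)  # rebuild the truncated history string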
@@ -0,0 +1,7 @@
datas = [
    {"pergunta": "Qual é a capital do Brasil?", "resposta": "A capital do Brasil é Brasília."},
    {"pergunta": "Quem descobriu o Brasil?", "resposta": "O Brasil foi descoberto por Pedro Álvares Cabral."},
    {"pergunta": "Qual é a maior floresta tropical do mundo?", "resposta": "A maior floresta tropical do mundo é a Floresta Amazônica."},
]
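If the dataset outgrows a few hard-coded pairs, one option is to keep them in a JSON file with the same "pergunta"/"resposta" keys and load them here. This is only a sketch; the ctesibio_data.json filename is a hypothetical example, not a file in this repository.

# Sketch: load question/answer pairs from a (hypothetical) JSON file
# instead of hard-coding them, keeping the same "pergunta"/"resposta" keys.
import json

with open("ctesibio_data.json", encoding="utf-8") as f:
    datas = json.load(f)  # expects a list of {"pergunta": ..., "resposta": ...} objects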