0.0.2 (update)

JuanBindez committed Dec 25, 2024
1 parent 02ac5a5 commit e91cab8
Showing 7 changed files with 296 additions and 13 deletions.
2 changes: 1 addition & 1 deletion build.sh
@@ -2,7 +2,7 @@

VERSION=0
MINOR=0
-PATCH=1
+PATCH=2
EXTRAVERSION=""

NOTES="(update)"
229 changes: 229 additions & 0 deletions ctesibioAI_colab.ipynb
@@ -0,0 +1,229 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CtesibioAI v0.0.2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### CtesibioAI comes with this file to be used in Colab, so you can train your model using the computational power of Colab"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### follow the steps to do your training\n",
"\n",
"* Install the requirements\n",
"* First you need to pass the data with questions and answers\n",
"* Train the model\n",
"* Test the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"pip install transformers datasets torch"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### You need to pass the data with questions and answers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"datas = [\n",
" {\"pergunta\": \"Qual é a capital do Brasil?\", \"resposta\": \"A capital do Brasil é Brasília.\"},\n",
" {\"pergunta\": \"Quem descobriu o Brasil?\", \"resposta\": \"O Brasil foi descoberto por Pedro Álvares Cabral.\"},\n",
" {\"pergunta\": \"Qual é a maior floresta tropical do mundo?\", \"resposta\": \"A maior floresta tropical do mundo é a Floresta Amazônica.\"},\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments\n",
"from datasets import Dataset\n",
"\n",
"from training_data import datas\n",
"\n",
"model_name = \"gpt2\"\n",
"tokenizer = GPT2Tokenizer.from_pretrained(model_name)\n",
"model = GPT2LMHeadModel.from_pretrained(model_name)\n",
"\n",
"special_tokens = {\"pad_token\": \"<PAD>\", \"bos_token\": \"<BOS>\", \"eos_token\": \"<EOS>\"}\n",
"if tokenizer.pad_token is None:\n",
" tokenizer.add_special_tokens(special_tokens)\n",
" model.resize_token_embeddings(len(tokenizer))\n",
"\n",
"formatted_data = [{\"text\": f\"<BOS>{d['pergunta']} {d['resposta']}<EOS>\"} for d in datas]\n",
"\n",
"dataset = Dataset.from_list(formatted_data)\n",
"\n",
"def tokenize_function(example):\n",
" encoding = tokenizer(\n",
" example[\"text\"],\n",
" padding=\"max_length\",\n",
" truncation=True,\n",
" max_length=128\n",
" )\n",
" encoding[\"labels\"] = encoding[\"input_ids\"].copy()\n",
" return encoding\n",
"\n",
"tokenized_dataset = dataset.map(tokenize_function, batched=True)\n",
"\n",
"\n",
"training_args = TrainingArguments(\n",
" output_dir=\"./ctesibioAI-model\",\n",
" overwrite_output_dir=True,\n",
" per_device_train_batch_size=2,\n",
" num_train_epochs=50,\n",
" save_steps=500,\n",
" save_total_limit=2,\n",
" logging_dir=\"./logs\",\n",
" logging_steps=10,\n",
" report_to=[], # Evita integração com wandb ou outros sistemas\n",
" evaluation_strategy=\"no\", # Desabilita avaliação durante o treinamento\n",
")\n",
"\n",
"\n",
"trainer = Trainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=tokenized_dataset,\n",
" tokenizer=tokenizer\n",
")\n",
"\n",
"# save model\n",
"trainer.train()\n",
"model.save_pretrained(\"./ctesibioAI-model\")\n",
"tokenizer.save_pretrained(\"./ctesibioAI-model\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"from transformers import GPT2LMHeadModel, GPT2Tokenizer\n",
"\n",
"model = GPT2LMHeadModel.from_pretrained(\"./ctesibioAI-model\")\n",
"tokenizer = GPT2Tokenizer.from_pretrained(\"./ctesibioAI-model\")\n",
"\n",
"tokenizer.pad_token = \"<PAD>\"\n",
"tokenizer.bos_token = \"<BOS>\"\n",
"tokenizer.eos_token = \"<EOS>\"\n",
"\n",
"input_text = \"<BOS>capital do brasil?\" \n",
"inputs = tokenizer.encode(input_text, return_tensors=\"pt\")\n",
"\n",
"output = model.generate(\n",
" inputs,\n",
" max_length=50,\n",
" num_return_sequences=1,\n",
" pad_token_id=tokenizer.pad_token_id, # Garantir consistência com o treinamento\n",
" temperature=0.7, # Controle de aleatoriedade\n",
" top_k=50, # Considerar apenas os 50 tokens mais prováveis\n",
" repetition_penalty=2.0, # Penalizar repetições\n",
")\n",
"\n",
"generated_text = tokenizer.decode(output[0], skip_special_tokens=True)\n",
"print(\"Ctesibio Response:\")\n",
"print(generated_text)"
]
},
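{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Optional: print only the newly generated tokens\n",
"\n",
"`model.generate` returns the prompt together with the continuation, so the cell above echoes your question as part of the answer. A minimal sketch (reusing `output`, `inputs`, and `tokenizer` from the cell above) that slices the prompt off before decoding:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"# Slice off the prompt tokens so only the model's continuation is decoded\n",
"new_tokens = output[0][inputs.shape[-1]:]\n",
"answer_only = tokenizer.decode(new_tokens, skip_special_tokens=True)\n",
"print(answer_only)"
]
},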
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Now if you want you can download the model to use wherever you want"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"\"\"\"This piece of code is only used to download the model to your machine through Colab\"\"\"\n",
"\n",
"from google.colab import files\n",
"import shutil\n",
"\n",
"pasta_para_zipar = '.ctesibioAI-model'\n",
"file_zip = 'ctesibioAI-model.zip' \n",
"\n",
"\n",
"shutil.make_archive(base_name=file_zip.replace('.zip', ''), format='zip', root_dir=pasta_para_zipar)\n",
"\n",
"files.download(file_zip)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
5 changes: 2 additions & 3 deletions download_model.py
@@ -1,9 +1,8 @@
from google.colab import files
import shutil

-# Replace 'nome_da_pasta' with the path of the folder you want to download
-pasta_para_zipar = '.gpt2-finetuned'
-file_zip = 'ctesibioAI-model.zip'
+pasta_para_zipar = './ctesibioAI-model'
+file_zip = 'ctesibioAI-model.zip'


shutil.make_archive(base_name=file_zip.replace('.zip', ''), format='zip', root_dir=pasta_para_zipar)
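
Once the zip is on the local machine, it can be unpacked and loaded back into transformers. A minimal sketch, assuming ctesibioAI-model.zip sits in the current directory:

import shutil

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Unpack the archive downloaded from Colab (assumed to be in the current directory)
shutil.unpack_archive("ctesibioAI-model.zip", "ctesibioAI-model")

# Load the fine-tuned model and tokenizer from the extracted folder
model = GPT2LMHeadModel.from_pretrained("./ctesibioAI-model")
tokenizer = GPT2Tokenizer.from_pretrained("./ctesibioAI-model")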
47 changes: 47 additions & 0 deletions promp2.py
@@ -0,0 +1,47 @@
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./ctesibioAI-model")
tokenizer = GPT2Tokenizer.from_pretrained("./ctesibioAI-model")

# Configure special tokens
tokenizer.pad_token = "<PAD>"
tokenizer.bos_token = "<BOS>"
tokenizer.eos_token = "<EOS>"

# Initialize the conversation history
history = "<BOS>"

print("Ctesibio AI - Interactive Chatbot Test")
print("Type 'exit' to end the conversation.\n")

while True:
    # User input
    user_input = input("You: ")
    if user_input.lower() == "exit":
        print("Ending the conversation. Goodbye!")
        break

    # Update the history with the user's input
    history += f"{user_input}<EOS>"

    # Tokenize the history
    inputs = tokenizer.encode(history, return_tensors="pt")

    # Generate the model's response
    outputs = model.generate(
        inputs,
        max_new_tokens=100,  # Budget for new tokens, independent of prompt length
        num_return_sequences=1,
        do_sample=True,  # Required for temperature/top_k to take effect
        pad_token_id=tokenizer.pad_token_id,  # Ensure consistency with training
        temperature=0.7,  # Control randomness
        top_k=50,  # Consider only the top 50 most likely tokens
        repetition_penalty=2.0,  # Penalize repetitions
    )

    # Decode only the newly generated tokens (generate also returns the prompt)
    response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
    print("Ctesibio:", response)

    # Update the history with the model's response
    history += f"{response}<EOS>"
2 changes: 1 addition & 1 deletion prompt.py
@@ -21,5 +21,5 @@
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Ctesibio-model Text response:")
print("Ctesibio Response:")
print(generated_text)
17 changes: 9 additions & 8 deletions train.py → training.py
@@ -1,6 +1,14 @@
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset


+# Here you pass the questions and answers to train your model
+datas = [
+    {"pergunta": "Qual é a capital do Brasil?", "resposta": "A capital do Brasil é Brasília."},
+    {"pergunta": "Quem descobriu o Brasil?", "resposta": "O Brasil foi descoberto por Pedro Álvares Cabral."},
+    {"pergunta": "Qual é a maior floresta tropical do mundo?", "resposta": "A maior floresta tropical do mundo é a Floresta Amazônica."},
+]
+
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
@@ -10,14 +18,7 @@
    tokenizer.add_special_tokens(special_tokens)
    model.resize_token_embeddings(len(tokenizer))

-# datas example
-data = [
-    {"pergunta": "Qual é a capital do Brasil?", "resposta": "A capital do Brasil é Brasília."},
-    {"pergunta": "Quem descobriu o Brasil?", "resposta": "O Brasil foi descoberto por Pedro Álvares Cabral."},
-    {"pergunta": "Qual é a maior floresta tropical do mundo?", "resposta": "A maior floresta tropical do mundo é a Floresta Amazônica."},
-]
-
-formatted_data = [{"text": f"<BOS>{d['pergunta']} {d['resposta']}<EOS>"} for d in data]
+formatted_data = [{"text": f"<BOS>{d['pergunta']} {d['resposta']}<EOS>"} for d in datas]

dataset = Dataset.from_list(formatted_data)

7 changes: 7 additions & 0 deletions training_data.py
@@ -0,0 +1,7 @@


datas = [
{"pergunta": "Qual é a capital do Brasil?", "resposta": "A capital do Brasil é Brasília."},
{"pergunta": "Quem descobriu o Brasil?", "resposta": "O Brasil foi descoberto por Pedro Álvares Cabral."},
{"pergunta": "Qual é a maior floresta tropical do mundo?", "resposta": "A maior floresta tropical do mundo é a Floresta Amazônica."},
]
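
A script in the same folder can then import the data instead of hard-coding it. A minimal usage sketch:

# Assumes training_data.py sits next to the script importing it
from training_data import datas

print(f"Loaded {len(datas)} question/answer pairs")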
