0.0.1 (update)
JuanBindez committed Dec 25, 2024
1 parent cc14cf5 commit 02ac5a5
Showing 5 changed files with 132 additions and 0 deletions.
32 changes: 32 additions & 0 deletions build.sh
@@ -0,0 +1,32 @@
#!/bin/bash

VERSION=0
MINOR=0
PATCH=1
EXTRAVERSION=""

NOTES="(update)"
BRANCH="main"

# Build the full version string, e.g. "0.0.1" or "0.1-rc1".
if [[ -z "$PATCH" ]]; then
    PATCH=""
else
    PATCH=".$PATCH"
fi

if [[ "$EXTRAVERSION" == *"-rc"* ]]; then
    FULL_VERSION="$VERSION.$MINOR$PATCH$EXTRAVERSION"
elif [[ -z "$EXTRAVERSION" ]]; then
    FULL_VERSION="$VERSION.$MINOR$PATCH"
else
    FULL_VERSION="$VERSION.$MINOR$PATCH.$EXTRAVERSION"
fi

# Commit, push, and tag the release.
git add .
git commit -m "$FULL_VERSION $NOTES"
git push -u origin "$BRANCH"
git tag "v$FULL_VERSION"
git push --tags
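
For context, a minimal usage sketch (an assumption, not part of the commit): from the repository root,

chmod +x build.sh    # one-time: make the script executable
./build.sh           # commits everything, pushes, and tags v0.0.1

Since the script stages the whole working tree, the version variables at the top should be bumped before each run.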
11 changes: 11 additions & 0 deletions download_model.py
@@ -0,0 +1,11 @@
from google.colab import files
import shutil

# Replace with the path of the folder you want to download
pasta_para_zipar = './gpt2-finetuned'
file_zip = 'ctesibioAI-model.zip'

# Zip the folder, then trigger a browser download from Colab.
shutil.make_archive(base_name=file_zip.replace('.zip', ''), format='zip', root_dir=pasta_para_zipar)

files.download(file_zip)
25 changes: 25 additions & 0 deletions prompt.py
@@ -0,0 +1,25 @@
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("./ctesibioAI-model")
tokenizer = GPT2Tokenizer.from_pretrained("./ctesibioAI-model")

# Re-register the special tokens used during fine-tuning.
tokenizer.pad_token = "<PAD>"
tokenizer.bos_token = "<BOS>"
tokenizer.eos_token = "<EOS>"

input_text = "<BOS>capital do brasil?"
inputs = tokenizer.encode(input_text, return_tensors="pt")

output = model.generate(
    inputs,
    max_length=50,
    num_return_sequences=1,
    do_sample=True,  # temperature and top_k only take effect when sampling
    pad_token_id=tokenizer.pad_token_id,  # keep padding consistent with training
    temperature=0.7,  # randomness control
    top_k=50,  # consider only the 50 most likely tokens
    repetition_penalty=2.0,  # penalize repetitions
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Ctesibio-model Text response:")
print(generated_text)
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
transformers
datasets
torch
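
A minimal setup sketch (an assumption, not part of the commit): with a Python 3 environment active, the unpinned dependencies install with

pip install -r requirements.txt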
61 changes: 61 additions & 0 deletions train.py
@@ -0,0 +1,61 @@
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# GPT-2 has no pad token by default; register the special tokens used below.
special_tokens = {"pad_token": "<PAD>", "bos_token": "<BOS>", "eos_token": "<EOS>"}
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(special_tokens)
    model.resize_token_embeddings(len(tokenizer))

# Example training data (Portuguese Q&A pairs).
data = [
    {"pergunta": "Qual é a capital do Brasil?", "resposta": "A capital do Brasil é Brasília."},
    {"pergunta": "Quem descobriu o Brasil?", "resposta": "O Brasil foi descoberto por Pedro Álvares Cabral."},
    {"pergunta": "Qual é a maior floresta tropical do mundo?", "resposta": "A maior floresta tropical do mundo é a Floresta Amazônica."},
]

formatted_data = [{"text": f"<BOS>{d['pergunta']} {d['resposta']}<EOS>"} for d in data]

dataset = Dataset.from_list(formatted_data)

def tokenize_function(example):
    encoding = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    # Use the input ids as labels for causal language modeling.
    encoding["labels"] = encoding["input_ids"].copy()
    return encoding

tokenized_dataset = dataset.map(tokenize_function, batched=True)


training_args = TrainingArguments(
    output_dir="./ctesibioAI-model",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    num_train_epochs=50,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    report_to=[],  # avoid reporting to wandb or other trackers
    evaluation_strategy="no",  # disable evaluation during training
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Train, then save the fine-tuned model and tokenizer.
trainer.train()
model.save_pretrained("./ctesibioAI-model")
tokenizer.save_pretrained("./ctesibioAI-model")
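
For context, a hedged sketch of the assumed end-to-end workflow (the run order is an inference from the files in this commit, not documented in it):

python train.py     # fine-tune GPT-2 on the Q&A pairs; saves to ./ctesibioAI-model
python prompt.py    # load the saved model and generate a response
# In Colab, download_model.py can then zip and download the model folder.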
