Commit e91cab8 (1 parent: 02ac5a5)
Showing 7 changed files with 296 additions and 13 deletions.
@@ -2,7 +2,7 @@
VERSION=0
MINOR=0
-PATCH=1
+PATCH=2
EXTRAVERSION=""

NOTES="(update)"
@@ -0,0 +1,229 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CtesibioAI v0.0.2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### CtesibioAI ships with this notebook so you can train your model in Colab, using Colab's compute power"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Follow these steps to run your training\n",
    "\n",
    "* Install the requirements\n",
    "* Provide the data with questions and answers\n",
    "* Train the model\n",
    "* Test the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "!pip install transformers datasets torch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Provide the data with questions and answers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "datas = [\n",
    "    {\"pergunta\": \"Qual é a capital do Brasil?\", \"resposta\": \"A capital do Brasil é Brasília.\"},\n",
    "    {\"pergunta\": \"Quem descobriu o Brasil?\", \"resposta\": \"O Brasil foi descoberto por Pedro Álvares Cabral.\"},\n",
    "    {\"pergunta\": \"Qual é a maior floresta tropical do mundo?\", \"resposta\": \"A maior floresta tropical do mundo é a Floresta Amazônica.\"},\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments\n",
    "from datasets import Dataset\n",
    "\n",
    "# `datas` is defined in the cell above, so no import from training_data.py is needed in Colab\n",
    "\n",
    "model_name = \"gpt2\"\n",
    "tokenizer = GPT2Tokenizer.from_pretrained(model_name)\n",
    "model = GPT2LMHeadModel.from_pretrained(model_name)\n",
    "\n",
    "special_tokens = {\"pad_token\": \"<PAD>\", \"bos_token\": \"<BOS>\", \"eos_token\": \"<EOS>\"}\n",
    "if tokenizer.pad_token is None:\n",
    "    tokenizer.add_special_tokens(special_tokens)\n",
    "    model.resize_token_embeddings(len(tokenizer))\n",
    "\n",
    "formatted_data = [{\"text\": f\"<BOS>{d['pergunta']} {d['resposta']}<EOS>\"} for d in datas]\n",
    "\n",
    "dataset = Dataset.from_list(formatted_data)\n",
    "\n",
    "def tokenize_function(example):\n",
    "    encoding = tokenizer(\n",
    "        example[\"text\"],\n",
    "        padding=\"max_length\",\n",
    "        truncation=True,\n",
    "        max_length=128\n",
    "    )\n",
    "    encoding[\"labels\"] = encoding[\"input_ids\"].copy()\n",
    "    return encoding\n",
    "\n",
    "tokenized_dataset = dataset.map(tokenize_function, batched=True)\n",
    "\n",
    "\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=\"./ctesibioAI-model\",\n",
    "    overwrite_output_dir=True,\n",
    "    per_device_train_batch_size=2,\n",
    "    num_train_epochs=50,\n",
    "    save_steps=500,\n",
    "    save_total_limit=2,\n",
    "    logging_dir=\"./logs\",\n",
    "    logging_steps=10,\n",
    "    report_to=[],  # avoid integration with wandb or other logging systems\n",
    "    evaluation_strategy=\"no\",  # disable evaluation during training\n",
    ")\n",
    "\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_dataset,\n",
    "    tokenizer=tokenizer\n",
    ")\n",
    "\n",
    "# train, then save the model and tokenizer\n",
    "trainer.train()\n",
    "model.save_pretrained(\"./ctesibioAI-model\")\n",
    "tokenizer.save_pretrained(\"./ctesibioAI-model\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "from transformers import GPT2LMHeadModel, GPT2Tokenizer\n",
    "\n",
    "model = GPT2LMHeadModel.from_pretrained(\"./ctesibioAI-model\")\n",
    "tokenizer = GPT2Tokenizer.from_pretrained(\"./ctesibioAI-model\")\n",
    "\n",
    "tokenizer.pad_token = \"<PAD>\"\n",
    "tokenizer.bos_token = \"<BOS>\"\n",
    "tokenizer.eos_token = \"<EOS>\"\n",
    "\n",
    "input_text = \"<BOS>capital do brasil?\"\n",
    "inputs = tokenizer.encode(input_text, return_tensors=\"pt\")\n",
    "\n",
    "output = model.generate(\n",
    "    inputs,\n",
    "    max_length=50,\n",
    "    num_return_sequences=1,\n",
    "    do_sample=True,  # sampling must be enabled for temperature/top_k to take effect\n",
    "    pad_token_id=tokenizer.pad_token_id,  # keep consistent with training\n",
    "    temperature=0.7,  # control randomness\n",
    "    top_k=50,  # sample only from the 50 most likely tokens\n",
    "    repetition_penalty=2.0,  # penalize repetition\n",
    ")\n",
    "\n",
    "generated_text = tokenizer.decode(output[0], skip_special_tokens=True)\n",
    "print(\"Ctesibio Response:\")\n",
    "print(generated_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optionally, download the model to use it wherever you want"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "\"\"\"This cell only downloads the trained model to your machine through Colab.\"\"\"\n",
    "\n",
    "from google.colab import files\n",
    "import shutil\n",
    "\n",
    "pasta_para_zipar = './ctesibioAI-model'\n",
    "file_zip = 'ctesibioAI-model.zip'\n",
    "\n",
    "\n",
    "shutil.make_archive(base_name=file_zip.replace('.zip', ''), format='zip', root_dir=pasta_para_zipar)\n",
    "\n",
    "files.download(file_zip)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
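After training, a quick way to sanity-check the saved model is a transformers text-generation pipeline. This is only a minimal sketch under the notebook's assumptions (the ./ctesibioAI-model output directory and the <BOS> prompt prefix); the example prompt itself is illustrative, not part of the repository.

# Minimal sketch: quick single-prompt check of the saved model.
# Assumes ./ctesibioAI-model was produced by the training cell above.
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="./ctesibioAI-model",
    tokenizer="./ctesibioAI-model",
)

# Hypothetical example prompt, following the "<BOS>question" format used in training.
result = generator("<BOS>Qual é a capital do Brasil?", max_length=50, num_return_sequences=1)
print(result[0]["generated_text"])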
@@ -0,0 +1,47 @@
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./ctesibioAI-model")
tokenizer = GPT2Tokenizer.from_pretrained("./ctesibioAI-model")

# Configure special tokens
tokenizer.pad_token = "<PAD>"
tokenizer.bos_token = "<BOS>"
tokenizer.eos_token = "<EOS>"

# Initialize the conversation history
history = "<BOS>"

print("Ctesibio AI - Interactive Chatbot Test")
print("Type 'exit' to end the conversation.\n")

while True:
    # User input
    user_input = input("You: ")
    if user_input.lower() == "exit":
        print("Ending the conversation. Goodbye!")
        break

    # Update the history with the user's input
    history += f"{user_input}<EOS>"

    # Tokenize the history
    inputs = tokenizer.encode(history, return_tensors="pt")

    # Generate the model's response
    outputs = model.generate(
        inputs,
        max_new_tokens=50,  # generate up to 50 tokens beyond the prompt
        num_return_sequences=1,
        do_sample=True,  # sampling must be enabled for temperature/top_k to take effect
        pad_token_id=tokenizer.pad_token_id,  # ensure consistency with training
        temperature=0.7,  # control randomness
        top_k=50,  # consider only the top 50 most likely tokens
        repetition_penalty=2.0,  # penalize repetitions
    )

    # Decode and display only the newly generated tokens, not the whole history
    response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
    print("Ctesibio:", response)

    # Update the history with the model's response
    history += f"{response}<EOS>"
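One caveat of the loop above is that the history string grows with every turn, while the base gpt2 checkpoint can only attend to 1024 tokens. Below is a minimal sketch of one possible way to keep the prompt within that budget, reusing the tokenizer and history variables from the script; the exact limits are assumptions to adjust as needed.

# Sketch: trim old turns so the encoded history stays within GPT-2's context window.
MAX_CONTEXT_TOKENS = 1024   # context window of the base "gpt2" checkpoint
RESERVED_FOR_REPLY = 50     # room for the generated reply (matches max_new_tokens above)

history_ids = tokenizer.encode(history)
budget = MAX_CONTEXT_TOKENS - RESERVED_FOR_REPLY
if len(history_ids) > budget:
    history_ids = history_ids[-budget:]      # keep only the most recent tokens
    history = tokenizer.decode(history_ids)  # rebuild the truncated history string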
@@ -0,0 +1,7 @@
datas = [
    {"pergunta": "Qual é a capital do Brasil?", "resposta": "A capital do Brasil é Brasília."},
    {"pergunta": "Quem descobriu o Brasil?", "resposta": "O Brasil foi descoberto por Pedro Álvares Cabral."},
    {"pergunta": "Qual é a maior floresta tropical do mundo?", "resposta": "A maior floresta tropical do mundo é a Floresta Amazônica."},
]
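If the dataset outgrows a few hard-coded pairs, one option is to keep them in a JSON file with the same "pergunta"/"resposta" keys and load them here. This is only a sketch; the ctesibio_data.json filename is a hypothetical example, not a file in this repository.

# Sketch: load question/answer pairs from a (hypothetical) JSON file
# instead of hard-coding them, keeping the same "pergunta"/"resposta" keys.
import json

with open("ctesibio_data.json", encoding="utf-8") as f:
    datas = json.load(f)  # expects a list of {"pergunta": ..., "resposta": ...} objects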