Commit

update

elamraniadnane1 committed May 19, 2024
1 parent 1607e81 commit ad2a88d
Showing 7 changed files with 588 additions and 207 deletions.
7 changes: 4 additions & 3 deletions scripts/app.py
@@ -14,13 +14,14 @@
 app = FastAPI()

 # Assuming static files are also in the scripts directory or a subdirectory therein
-app.mount("/scripts", StaticFiles(directory="C:\\Users\\LENOVO\\Desktop\\CSC5356_SP24\\scripts"), name="scripts")
+app.mount("/scripts", StaticFiles(directory="C:\\Users\\LENOVO\\Desktop\\CSC5382_SP24_FINALPROJECT\\scripts"), name="scripts")

 # Setup for templates
-templates = Jinja2Templates(directory="C:\\Users\\LENOVO\\Desktop\\CSC5356_SP24\\scripts")
+templates = Jinja2Templates(directory="C:\\Users\\LENOVO\\Desktop\\CSC5382_SP24_FINALPROJECT\\scripts")

 class Config:
-    MODEL_PATH = os.getenv("MODEL_PATH", "/app/model")
+    MODEL_PATH = os.getenv("MODEL_PATH", "C:\\Users\\LENOVO\\Desktop\\CSC5382_SP24_FINALPROJECT\\bert-election2024-twitter-stance-biden")
+    CSV_FILE_PATH = os.getenv("CSV_FILE_PATH", "C:\\Users\\LENOVO\\Desktop\\CSC5382_SP24_FINALPROJECT\\dataset_reduced.csv")

 @lru_cache()
 def load_model():
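The body of load_model is collapsed in this diff. As a rough sketch only (the actual implementation is not shown; the transformers calls below are an assumption based on the model paths introduced in this commit), a cached loader over Config.MODEL_PATH might look like:

    from functools import lru_cache
    from transformers import BertTokenizer, BertForSequenceClassification

    @lru_cache()
    def load_model():
        # Cached so every request reuses a single model instance
        tokenizer = BertTokenizer.from_pretrained(Config.MODEL_PATH)
        model = BertForSequenceClassification.from_pretrained(Config.MODEL_PATH)
        model.eval()  # inference mode; serving needs no gradients
        return tokenizer, model

Since both defaults are hard-coded Windows paths, deployments can point elsewhere by setting the MODEL_PATH and CSV_FILE_PATH environment variables rather than editing the source.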
4 changes: 2 additions & 2 deletions scripts/dataset_reduced.csv
Git LFS file not shown
120 changes: 53 additions & 67 deletions scripts/export_vertex_ai_data.py
@@ -1,69 +1,55 @@
+from transformers import BertTokenizer, BertForSequenceClassification
+import torch
+import subprocess
 import os
-import google.auth
-from google.auth.transport.requests import Request
-from google.auth import default
-from google.auth import exceptions
-from google.oauth2.credentials import Credentials
-from google.auth.transport.requests import AuthorizedSession
-from google.auth import impersonated_credentials
-from google.auth.transport import requests
-
-from google.cloud import aiplatform
-
-# Function to authenticate and get the credentials
-def authenticate_with_google_cloud():
-    credentials, project = google.auth.default(
-        scopes=["https://www.googleapis.com/auth/cloud-platform"]
-    )
-    if not credentials.valid:
-        if credentials.expired and credentials.refresh_token:
-            credentials.refresh(Request())
-        else:
-            # If credentials are not valid and cannot be refreshed, open the browser for authentication
-            flow = google.auth.oauth2client.OAuth2WebServerFlow(
-                client_id=os.getenv("GOOGLE_CLIENT_ID"),
-                client_secret=os.getenv("GOOGLE_CLIENT_SECRET"),
-                scope="https://www.googleapis.com/auth/cloud-platform",
-                redirect_uri="urn:ietf:wg:oauth:2.0:oob"
-            )
-            auth_uri = flow.step1_get_authorize_url()
-            print("Please go to this URL: {}".format(auth_uri))
-            auth_code = input("Enter the authorization code: ")
-            credentials = flow.step2_exchange(auth_code)
-
-    return credentials, project
-
-# Function to export Vertex AI data to a .txt file
-def export_vertex_ai_data():
-    # Authenticate and get the credentials
-    credentials, project_id = authenticate_with_google_cloud()
-
-    # Initialize the AI Platform client
-    aiplatform.init(project=project_id, credentials=credentials)
-
-    # Define the file to save the data
-    export_file = "vertex_ai_data.txt"
-
-    with open(export_file, "w") as file:
-        # Example: List all datasets in the project
-        datasets = aiplatform.Dataset.list()
-        file.write("Datasets:\n")
-        for dataset in datasets:
-            file.write(f"Name: {dataset.name}, Display Name: {dataset.display_name}\n")
-
-        # Example: List all models in the project
-        models = aiplatform.Model.list()
-        file.write("\nModels:\n")
-        for model in models:
-            file.write(f"Name: {model.name}, Display Name: {model.display_name}\n")
-
-        # Example: List all endpoints in the project
-        endpoints = aiplatform.Endpoint.list()
-        file.write("\nEndpoints:\n")
-        for endpoint in endpoints:
-            file.write(f"Name: {endpoint.name}, Display Name: {endpoint.display_name}\n")
-
-    print(f"Data exported to {export_file}")
-
-if __name__ == "__main__":
-    export_vertex_ai_data()
+MODEL_PATH = 'C:\\Users\\LENOVO\\Desktop\\CSC5382_SP24_FINALPROJECT\\scripts\\bert-election2024-twitter-stance-biden'
+SAVE_PATH = 'C:\\Users\\LENOVO\\Desktop\\saved_model'
+HANDLER_PATH = 'C:\\Users\\LENOVO\\Desktop\\CSC5382_SP24_FINALPROJECT\\scripts\\transformers_handler.py'  # Update this path as needed
+
+# Load the model and tokenizer
+tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
+model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
+
+# Save the tokenizer and model
+tokenizer.save_pretrained(SAVE_PATH)
+model.save_pretrained(SAVE_PATH)
+
+# If the model uses PyTorch, save the model as a .bin file
+model_file_path = f"{SAVE_PATH}/pytorch_model.bin"
+torch.save(model.state_dict(), model_file_path)
+
+# Determine the PyTorch version
+torch_version = torch.__version__
+print(f"PyTorch version: {torch_version}")
+
+# Prepare the model for TorchServe
+# Install torch-model-archiver if not already installed
+try:
+    import torch_model_archiver
+except ImportError:
+    subprocess.run(["pip", "install", "torch-model-archiver"])
+
+# Create model_store directory if it doesn't exist
+model_store_path = os.path.join(SAVE_PATH, "model_store")
+os.makedirs(model_store_path, exist_ok=True)
+
+# Archive the model
+archive_command = [
+    "torch-model-archiver",
+    "--model-name", "bert-election2024",
+    "--version", "1.0",
+    "--serialized-file", model_file_path,
+    "--handler", HANDLER_PATH,
+    "--export-path", model_store_path,
+    "--extra-files", f"{SAVE_PATH}/config.json,{SAVE_PATH}/vocab.txt",
+    "--force"
+]
+subprocess.run(archive_command)
+
+# Verify model archive
+archive_file = os.path.join(model_store_path, "bert-election2024.mar")
+if os.path.exists(archive_file):
+    print(f"Model archive created at: {archive_file}")
+else:
+    print("Failed to create model archive.")
Empty file added scripts/files/Dockerfile
326 changes: 326 additions & 0 deletions scripts/final_notebook.ipynb

Large diffs are not rendered by default.

209 changes: 74 additions & 135 deletions scripts/improved_bias_shap_lime_evaluation.py
@@ -1,15 +1,13 @@
 import torch
 from transformers import BertTokenizer, BertForSequenceClassification
 import pandas as pd
+import shap
 import lime
 from lime.lime_text import LimeTextExplainer
+import matplotlib.pyplot as plt
 import numpy as np
 from aequitas.group import Group
 from aequitas.bias import Bias
 from aequitas.fairness import Fairness
 import re
-import shap
-import matplotlib.pyplot as plt

 # Load the model and tokenizer
 MODEL_PATH = 'C:\\Users\\LENOVO\\Desktop\\CSC5382_SP24_FINALPROJECT\\scripts\\bert-election2024-twitter-stance-biden'
@@ -20,142 +18,83 @@
 CSV_FILE_PATH = 'C:\\Users\\LENOVO\\Desktop\\CSC5382_SP24_FINALPROJECT\\scripts\\dataset_reduced.csv'
 data = pd.read_csv(CSV_FILE_PATH)

-# Preprocess data
-def preprocess_data(data):
-    def clean_text(text):
-        text = re.sub(r'http\S+', '', text)
-        text = re.sub(r'@\S+|#\S+', '', text)
-        text = re.sub(r'[^A-Za-z\s]', '', text)
-        text = text.lower().strip()
-        return text
-
-    data['text'] = data['text'].apply(clean_text)
-    label_mapping = {'NONE': 0, 'FAVOR': 1, 'AGAINST': 2}
-    data['label'] = data['label'].map(label_mapping)
-    return data
-
-data = preprocess_data(data)
-
-# Adjust sample size based on the dataset size
-sample_size = min(100, len(data))
-subset_data = data.sample(n=sample_size, random_state=42)
-texts = subset_data['text'].values
-labels = subset_data['label'].values
-
-# Tokenize the texts
-encoded_inputs = tokenizer.batch_encode_plus(
-    texts,
-    add_special_tokens=True,
-    max_length=128,
-    padding=True,
-    truncation=True,
-    return_tensors='pt'
-)
-
-# Function to get model predictions
-def predict(inputs):
-    model.eval()
-    with torch.no_grad():
-        outputs = model(**inputs)
-    return outputs.logits
+# Map labels to numerical values
+label_mapping = {'NONE': 0, 'FAVOR': 1, 'AGAINST': 2}
+data['label'] = data['label'].map(label_mapping)

-inputs = {
-    'input_ids': encoded_inputs['input_ids'],
-    'attention_mask': encoded_inputs['attention_mask']
-}
+# Binarize the scores: Consider 'FAVOR' (1) as positive, 'NONE' (0) and 'AGAINST' (2) as negative
+data['score'] = data['label'].apply(lambda x: 1 if x == 1 else 0)

-logits = predict(inputs)
-predictions = torch.argmax(logits, dim=1).numpy()
+# Prepare data for Aequitas
+aequitas_data = pd.DataFrame()
+aequitas_data['score'] = data['score']
+aequitas_data['label_value'] = data['score']  # Same as score because we already binarized it

-# Binarize labels and predictions for Aequitas
-binarized_labels = (labels > 0).astype(int)  # Binarize: 0 -> 0 (NONE), 1 or 2 -> 1 (FAVOR, AGAINST)
-binarized_predictions = (predictions > 0).astype(int)
+# Add demographic data (for example purposes, assume a 'demographic' column in the dataset)
+# In a real-world scenario, you should replace this with actual demographic data
+aequitas_data['attribute'] = data['demographic'] if 'demographic' in data.columns else np.random.choice(['group1', 'group2'], len(data))

-# Aequitas Bias and Fairness Assessment
+# Group metric calculation
 group = Group()
+xtab, _ = group.get_crosstabs(aequitas_data)

+# Bias calculation
 bias = Bias()
+bdf = bias.get_disparity_predefined_groups(xtab, original_df=aequitas_data, ref_groups_dict={'attribute': 'group1'}, alpha=0.05, mask_significance=True)

+# Fairness calculation
 fairness = Fairness()
+fdf = fairness.get_group_value_fairness(bdf)

-# Prepare data for Aequitas
-aequitas_df = pd.DataFrame({
-    'score': binarized_predictions,
-    'label_value': binarized_labels
-})
-
-print(aequitas_df)
-
-# Add demographic columns if available (e.g., race, gender)
-# aequitas_df['race'] = subset_data['race']
-# aequitas_df['gender'] = subset_data['gender']
-
-xtab, _ = group.get_crosstabs(aequitas_df)
-
-# Ensure the reference groups dictionary contains all necessary references
-# Fixing the KeyError: 0 by checking if the mode() returns a value or not
-ref_groups_dict = {'score': xtab['score'].mode().iloc[0] if not xtab['score'].mode().empty else 0}
-
-print(ref_groups_dict)
-
-# Calculate the actual number of attributes in the input dataframe
-actual_number_of_attributes = len(aequitas_df.columns)
-print(f"Actual number of attributes in the input dataframe: {actual_number_of_attributes}")
-
-# Check if ref_groups_dict has the necessary keys
-if len(ref_groups_dict) < actual_number_of_attributes:
-    ref_groups_dict['label_value'] = 0
-
-b = bias.get_disparity_predefined_groups(xtab, original_df=aequitas_df, ref_groups_dict=ref_groups_dict)
-f = fairness.get_group_value_fairness(b)
-
-print("Aequitas Bias and Fairness Results:")
-print(f)
-
-# Compute SHAP values
-class BertModelWrapper:
-    def __init__(self, model, tokenizer):
-        self.model = model
-        self.tokenizer = tokenizer
-
-    def __call__(self, texts):
-        inputs = self.tokenizer.batch_encode_plus(
-            texts,
-            add_special_tokens=True,
-            max_length=128,
-            padding=True,
-            truncation=True,
-            return_tensors='pt'
-        )
-        logits = predict(inputs)
-        return logits.detach().numpy()
-
-# Add a masker for text data
-masker = shap.maskers.Text(tokenizer)
-explainer = shap.Explainer(BertModelWrapper(model, tokenizer), masker=masker)
-shap_values = explainer(texts)
-
-# Visualize SHAP values
-shap.summary_plot(shap_values, texts, class_names=['NONE', 'FAVOR', 'AGAINST'])
-
-# LIME for local interpretability
-lime_explainer = LimeTextExplainer(class_names=['NONE', 'FAVOR', 'AGAINST'])
-
-def lime_predict_proba(texts):
-    inputs = tokenizer.batch_encode_plus(
-        texts,
-        add_special_tokens=True,
-        max_length=128,
-        padding=True,
-        truncation=True,
-        return_tensors='pt'
-    )
-    logits = predict(inputs)
-    proba = torch.nn.functional.softmax(logits, dim=1).detach().numpy()
-    return proba
-
-# Explain a single instance with LIME
-if sample_size > 0:
-    idx = 0  # Index of the instance to explain
-    lime_exp = lime_explainer.explain_instance(texts[idx], lime_predict_proba, num_features=10)
-    lime_exp.show_in_notebook(text=True)
-else:
-    print("Dataset is empty after preprocessing.")
+# Display results
+print(fdf)

+# Check available metrics in fdf
+print(fdf.columns)

+# Define the metrics to plot based on the available columns
+available_metrics = ['tpr_disparity', 'fnr_disparity']  # Update this list based on the printed columns
+groups = fdf['attribute_value'].unique()

+# Plot results
+fig, ax = plt.subplots(len(available_metrics), 1, figsize=(10, 15))

+for i, metric in enumerate(available_metrics):
+    for group in groups:
+        group_data = fdf[fdf['attribute_value'] == group]
+        ax[i].bar(group, group_data[metric].values[0], label=f'{group} {metric}')
+    ax[i].set_title(f'{metric}')
+    ax[i].set_xlabel('Groups')
+    ax[i].set_ylabel('Disparity')
+    ax[i].legend()

+plt.tight_layout()
+plt.show()

+# SHAP analysis for model explainability
+# Tokenize the data
+tokenized_data = data['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

+# Pad sequences to the same length
+max_len = max([len(i) for i in tokenized_data])
+padded_data = np.array([i + [0]*(max_len-len(i)) for i in tokenized_data])

+# Define a prediction function
+def predict(inputs):
+    inputs = torch.tensor(inputs).to(torch.int64)  # Ensure inputs are in the correct format
+    with torch.no_grad():
+        outputs = model(inputs)[0]
+    return outputs.cpu().numpy()

+# Create SHAP explainer using KernelExplainer
+background = padded_data[:100]  # Use a subset as the background for the explainer
+explainer = shap.KernelExplainer(predict, background)

+# Select a subset of data to explain
+sample_data = padded_data[:10]

+# Get SHAP values
+shap_values = explainer.shap_values(sample_data)

+# Plot SHAP values for the first prediction
+shap.summary_plot(shap_values, features=sample_data, feature_names=tokenizer.convert_ids_to_tokens(range(max_len)))
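KernelExplainer is model-agnostic but expensive: its runtime grows with the number of background rows and with max_len features per explained sample. A common mitigation, sketched here as an assumption rather than as part of this commit (the 10-row background and nsamples=100 are arbitrary illustrative choices), is to shrink the background set and cap the number of perturbation samples:

    import shap

    # Subsample the background to a handful of representative rows
    # to keep the kernel regression tractable
    background_small = shap.sample(padded_data[:100], 10)
    explainer = shap.KernelExplainer(predict, background_small)
    shap_values = explainer.shap_values(padded_data[:10], nsamples=100)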
