Commit 8acf544

idek anymore
1 parent 3d7966c commit 8acf544

4 files changed: +41 −23 lines

emotion_classifier.py (+35 −15)
@@ -1,10 +1,14 @@
 import cv2
-import torch
 import numpy as np
-from fer import FER
+from feat import Detector
 
-# Initialize Face Detector & Emotion Recognizer
-detector = FER()
+# Initialize Py-Feat Detector
+detector = Detector(
+    face_model="retinaface",
+    landmark_model="mobilenet",
+    au_model="xgb",
+    emotion_model="resmasknet"
+)
 
 # Open webcam
 cap = cv2.VideoCapture(0)
@@ -14,21 +18,37 @@
     if not ret:
         break
 
-    # Detect emotion using FER
-    results = detector.detect_emotions(frame)
+    try:
+        # Detect faces
+        face_results = detector.detect_faces(frame)
 
-    for result in results:
-        (x, y, w, h) = result["box"]
-        emotion, score = max(result["emotions"].items(), key=lambda item: item[1])
+        if face_results and len(face_results) > 0:
+            for face in face_results:
+                bbox = face["bbox"]  # Correct way to extract bounding box
+                landmarks = face["landmarks"]  # Correct way to extract landmarks
 
-        # Draw face box
-        cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
+                # Ensure bbox is correctly formatted as (x, y, w, h)
+                x, y, w, h = map(int, bbox)
 
-        # Display emotion classification
-        cv2.putText(frame, f"{emotion} ({score:.2f})", (x, y-10), cv2.FONT_HERSHEY_SIMPLEX,
-                    0.8, (0, 255, 0), 2, cv2.LINE_AA)
+                # Get emotion predictions
+                emotion_scores = detector.detect_emotions(frame, facebox=bbox, landmarks=landmarks)
+
+                if not emotion_scores.empty:
+                    dominant_emotion = emotion_scores.idxmax(axis=1).values[0]  # Get most probable emotion
+                else:
+                    dominant_emotion = "Unknown"
 
-    cv2.imshow("Emotion Classification", frame)
+                # Draw face bounding box
+                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
+
+                # Display dominant emotion
+                cv2.putText(frame, f"Emotion: {dominant_emotion}", (x, y - 10),
+                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2, cv2.LINE_AA)
+
+    except Exception as e:
+        print(f"Error: {e}")
+
+    cv2.imshow("Py-Feat Emotion Classification", frame)
 
     if cv2.waitKey(1) & 0xFF == ord("q"):  # Press 'q' to quit
         break
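
Note: as a point of comparison with the change above, here is a minimal sketch of the same per-frame emotion step using py-feat's single-image entry point. The detect_image call, the Fex .emotions accessor, and the temporary frame file are assumptions (the Detector API differs between py-feat releases), so treat this as illustrative rather than the committed code.

# Minimal sketch (assumed py-feat API): grab one webcam frame, run the full
# detection pipeline on it, and print the most probable emotion per face.
import cv2
from feat import Detector

detector = Detector(
    face_model="retinaface",
    landmark_model="mobilenet",
    au_model="xgb",
    emotion_model="resmasknet",
)

cap = cv2.VideoCapture(0)
ret, frame = cap.read()
cap.release()

if ret:
    cv2.imwrite("frame.jpg", frame)            # detect_image works on an image path (assumption)
    fex = detector.detect_image("frame.jpg")   # returns a Fex dataframe, one row per detected face
    if not fex.emotions.empty:
        print(fex.emotions.idxmax(axis=1))     # dominant emotion for each detected face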

gemini_prompt.py (+4 −6)
@@ -8,7 +8,6 @@ def speak(text):
     engine.say(text)
     engine.runAndWait()
 
-# Configure Gemini API
 genai.configure(api_key=GEMINI_API_KEY)
 
 # Load Gemini model
@@ -18,13 +17,13 @@ def speak(text):
 def generate_visual_context_summary(speech_text, pose_label, emotion_label, gesture_label):
     prompt = f"""
     Here is the information available from the other person talking to the user:
-    - **Speech**: "{speech_text}"
+    - **Speech from the other person in the conversation**: "{speech_text}"
     - **Pose**: {pose_label}
     - **Emotion**: {emotion_label}
     - **Gesture**: {gesture_label}
 
     You are assisting a blind or neurodivergent user by providing a concise one-sentence summary of the visual context they are missing using the information provided. In addition to the analysis this sentence should provide, it should also include all pieces of information available except for the speech in the sentence.
-    The summary should be natural, clear, and informative without unnecessary details.
+    The summary should be natural, clear, and informative without unnecessary details. But, it should give context to the user about visual information. For example, if the speech is a greeting and a hand is raised, it means they are waving.
     """
 
     # Send prompt to Gemini API
@@ -33,12 +32,11 @@ def generate_visual_context_summary(speech_text, pose_label, emotion_label, gest
     return response.text  # Get the AI-generated response
 
 # Example data (Replace with real values)
-speech_text = "I'm really excited about this project!"
+speech_text = "Raya has a ball!"
 pose_label = "Open stance"
 emotion_label = "Happy"
-gesture_label = "Hand raised"
+gesture_label = "Right Hand Raised"
 
-# Call function and print result
 summary = generate_visual_context_summary(speech_text, pose_label, emotion_label, gesture_label)
 print(summary)
 speak(summary)
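
Note: the "# Load Gemini model" step is unchanged context in this diff and its body is not shown. A minimal sketch of how that setup typically looks with the google-generativeai client follows; the model id and the API-key placeholder are assumptions, not taken from this commit.

# Minimal sketch (assumptions noted): configure the client and load a model,
# mirroring the genai.configure / response.text usage shown in the diff.
import google.generativeai as genai

GEMINI_API_KEY = "YOUR_API_KEY"                    # placeholder, not from the commit
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel("gemini-1.5-flash")  # assumed model id

response = model.generate_content("Summarize the visual context in one sentence.")
print(response.text)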

recorded_audio.wav (240 KB)

Binary file not shown.

speech_recorder.py (+2 −2)
@@ -7,7 +7,7 @@
 
 # Audio settings
 FORMAT = pyaudio.paInt16
-CHANNELS = 1
+CHANNELS = 1
 RATE = 44100  # Sample rate
 CHUNK = 1024  # Buffer size
 OUTPUT_FILENAME = "recorded_audio.wav"
@@ -18,7 +18,7 @@ def record_speech(button="space"):
     print(f"Press and hold '{button}' to record speech...")
 
     # Wait until the button is pressed
-    while not keyboard.is_pressed(button):
+    while not keyboard.is_pressed(button):
         time.sleep(0.01)  # Prevent CPU overuse
 
     print("Recording... Speak now.")
