-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataframes.py
56 lines (44 loc) · 2.93 KB
/
dataframes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import pandas as pd
import ast
from unidecode import unidecode
import re
## AUX. FUNCTION TO TRANSFORM TITLES AND NAMES
def string_transformation(text):
    """Normalize a title or name into a compact key used for matching.

    The text is lowercased, stripped of surrounding whitespace and has its
    spaces removed; it is then passed through ``unidecode`` and every
    character that is neither a word character nor whitespace is dropped.

    Args:
        text: Value to normalize. Any ``str`` instance (including instances
            of ``str`` subclasses) is processed.

    Returns:
        The normalized string, or the message
        ``"Entered value is not valid."`` when ``text`` is not a string.
    """
    # isinstance() instead of an exact type comparison, so that instances of
    # str subclasses are normalized too rather than rejected.
    if not isinstance(text, str):
        return "Entered value is not valid."
    text = text.lower().strip().replace(" ", "")
    text = unidecode(text)  # delete accents
    # delete special characters and punctuation marks
    return re.sub(r'[^\w\s]', '', text)
## DATA LOADING AT THE BEGINNING SO THE FUNCTIONS DON'T HAVE TO DO IT EVERY TIME
# The four CSV files are read once, at import time, in the order listed;
# each name on the left receives the frame read from the path in the same
# position on the right.
df_movies, df_crew, actor_financial, director_financial = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./processed_data/movies.csv",
        "./processed_data/crew.csv",
        "./processed_data/actor_financial.csv",
        "./processed_data/director_financial.csv",
    )
)
## LOWERCASE THESE TO GET A MATCH IN THE QUERIES
# The vectorized .str accessor propagates missing values as NaN, whereas
# calling .lower() on each element raises AttributeError on a NaN cell.
df_movies["release_month"] = df_movies["release_month"].str.lower()
df_movies["release_day"] = df_movies["release_day"].str.lower()
## TRANSFORMING STRINGS TO LIST
# Every non-null cell of these columns is parsed with ast.literal_eval;
# null cells are kept exactly as they are.
for _column in (
    "genres_list",
    "directors",
    "spoken_languages_list",
    "production_countries_list",
    "production_companies_list",
):
    df_movies[_column] = [
        raw if pd.isnull(raw) else ast.literal_eval(raw)
        for raw in df_movies[_column]
    ]
## EXTRACTING THE ELEMENTS OF THE LISTS
# Each parsed value is collapsed into one ", "-separated string, while
# missing cells are left untouched. The previous condition (`x if None else
# ...`) was always false, so a missing cell reached ", ".join() and raised
# TypeError. The scalar check comes first because pd.isnull() on a list
# returns an array, which cannot be used as a condition.
for _column in (
    "genres_list",
    "directors",
    "spoken_languages_list",
    "production_countries_list",
    "production_companies_list",
):
    df_movies[_column] = [
        cell
        if pd.api.types.is_scalar(cell) and pd.isnull(cell)
        else ", ".join(cell)
        for cell in df_movies[_column]
    ]
## MAKING THE CORPUS FOR MODEL
# genres and collection are added twice to give more weight to those attributes
# Missing values are replaced by "" so that one null field does not turn the
# whole concatenated corpus entry into NaN.
df_movies["corpus"] = (
    df_movies["title"].fillna("")
    + ", " + df_movies["genres_list"].fillna("")
    + ", " + df_movies["overview"].fillna("")
    + ", " + df_movies["directors"].fillna("")
    + ", " + df_movies["collection"].fillna("")
    + ", " + df_movies["genres_list"].fillna("")
    + ", " + df_movies["collection"].fillna("")
)
## TRANSFORMING TITLE TO EASE THE SEARCHING
# string_transformation is applied to every title, in row order; non-string
# titles receive whatever message that function returns for invalid input.
df_movies["transformed_title"] = list(map(string_transformation, df_movies["title"]))
## DATA FOR FIT
# Only rows with at least 250 votes are kept, restricted to the columns
# listed below. reset_index() renumbers the rows from 0 and keeps the
# previous row labels in a new "index" column.
chosen_columns = ["title", "transformed_title", 'genres_list', "directors", "corpus"]
_has_enough_votes = df_movies["vote_count"] >= 250
df_train = df_movies.loc[_has_enough_votes, chosen_columns].reset_index()