-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_cleaning.py
19 lines (15 loc) · 1.02 KB
/
data_cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import pandas as pd
import numpy as np
# Column names assigned to the raw CSV; passing them via `names=` means every
# row of the file (including the first) is read as data.
columns = ['job_title','Company', 'working_place','website', 'posted_time', 'working_type', 'job_description']
data = pd.read_csv('job_list.csv', names=columns, index_col=False)

# Parse job descriptions into 0/1 keyword flags.
# Maps each output flag column to the lowercase substring searched for in the
# description. Insertion order determines the column order in the output CSV.
keyword_flags = {
    'big_data': 'big data',
    'analysis': 'analysis',
    'data_visualization': 'visualization',
    'Hadoop': 'hadoop',
    'Database': 'database',
    'Apache': 'spark',  # NOTE: the 'Apache' column flags the substring 'spark'
    # 'r_studio': 'r-studio',  # disabled in the original script
}

# Lowercase every description once instead of once per keyword. Missing
# descriptions are read by pandas as float NaN, which has no .lower(); treat
# any non-string cell as empty text so it yields 0 for every flag instead of
# raising AttributeError.
descriptions = data['job_description'].map(
    lambda text: text.lower() if isinstance(text, str) else ''
)

for flag_column, keyword in keyword_flags.items():
    # `kw=keyword` binds the current keyword to the lambda at definition time.
    data[flag_column] = descriptions.map(lambda text, kw=keyword: int(kw in text))

# TODO: parse posted_time; no code here touches it, so it is written out as read.

data.to_csv('jobs_data_cleaned.csv', index=False)
# Read the cleaned file back into `test`; nothing else in this script uses it.
test = pd.read_csv('jobs_data_cleaned.csv')