-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPracticaClasificacion.sas
178 lines (119 loc) · 5.04 KB
/
PracticaClasificacion.sas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
/* Library that holds the input dataset and every derived dataset.
   NOTE(review): the path contains the literal placeholder YOUR_FOLDER_HERE;
   presumably it must be replaced with a real folder before running. */
libname lib_data '/home/YOUR_FOLDER_HERE/data';

/* DATA STEP */
/* Data load: copy lib_data.birthwgt as-is into lib_data.babyClassWeigths. */
data lib_data.babyClassWeigths;
    set lib_data.birthwgt;
run;
/* Exploratory analysis */

/* List the first 15 observations of the loaded dataset under a descriptive title. */
title 'Primeros registros del dataset';
proc print data=lib_data.babyClassWeigths (obs=15);
run;
/* Cancel the title: a TITLE statement stays in effect until it is changed or
   cancelled, so without this the "first records" title would also be printed
   on the output of every later procedure. */
title;

/* Frequencies: one-way tables without cumulative statistics, plus a frequency
   plot, for each categorical variable. */
proc freq data=lib_data.babyClassWeigths;
    tables LowBirthWgt AgeGroup Death Drinking Married Race Smoking SomeCollege
           / nocum plots=freqplot;
run;
/* Data preparation */
/* Dichotomize the categorical variables into 0/1 indicator variables and impute
   the missing values of some of them. The original categorical columns are
   dropped from the output dataset. */
data lib_data.babyWDummy;
    set lib_data.babyClassWeigths;
    drop LowBirthWgt AgeGroup Death Married Race Smoking Drinking SomeCollege;

    /* Target variable as numeric: 1 when LowBirthWgt is 'Yes', 0 otherwise.
       A comparison evaluates to 1 (true) or 0 (false). */
    lowBW = (LowBirthWgt = 'Yes');

    /* Classification variables */
    /* The three possible age groups become three indicator variables. */
    AgeGroup1 = (AgeGroup = 1);
    AgeGroup2 = (AgeGroup = 2);
    AgeGroup3 = (AgeGroup = 3);

    xDeath   = (Death = 'Yes');
    xMarried = (Married = 'Yes');

    /* One indicator per race category. */
    RaceAsian    = (Race = 'Asian');
    RaceBlack    = (Race = 'Black');
    RaceHispanic = (Race = 'Hispanic');
    RaceNative   = (Race = 'Native');
    RaceWhite    = (Race = 'White');

    /* For Drinking, 'No' is the most frequent value, so missing (blank) values
       are assigned to 'No'. Any other value leaves xDrinking missing. */
    if Drinking = 'Yes' then xDrinking = 1;
    else if Drinking in ('No', ' ') then xDrinking = 0;

    /* For SomeCollege the missing values are too frequent to assign to a single
       category, so they are split 50/50: odd rows get 1 and even rows get 0
       (mod(_N_, 2) is 1 on odd iterations and 0 on even ones). */
    select (SomeCollege);
        when ('Yes') xSomeCollege = 1;
        when ('No')  xSomeCollege = 0;
        otherwise    xSomeCollege = mod(_N_, 2);
    end;

    /* Missing (blank) Smoking values go to the majority category, 'No'.
       Any other value leaves xSmoking missing. */
    if Smoking = 'Yes' then xSmoking = 1;
    else if Smoking in ('No', ' ') then xSmoking = 0;
run;
/* To make the dichotomized variables easier to read, define formats that map
   the 0/1 values to the label of the corresponding category. */
proc format;
    value labelsLow
        0 = "Sin bajo Peso"
        1 = "Con Bajo Peso";

    value labelsAgeOne
        0 = "No Grupo 1"
        1 = "Grupo 1";

    value labelsAgeTwo
        0 = "No Grupo 2"
        1 = "Grupo 2";

    value labelsAgeThree
        0 = "No Grupo 3"
        1 = "Grupo 3";

    value labelsDeath
        0 = "Vivo"
        1 = "Muerto";

    value labelsMarried
        0 = "No Casada"
        1 = "Casada";

    value labelsCollege
        0 = "Sin Estudios superiores"
        1 = "Con Estudios Superiores";

    value labelsDrinking
        0 = "No bebe"
        1 = "Bebe";

    value labelsSmoking
        0 = "No fuma"
        1 = "Fuma";

    value labelsRaceAsian
        0 = "No Asiático"
        1 = "Asiático";

    value labelsRaceBlack
        0 = "No Negro"
        1 = "Negro";

    value labelsRaceHispanic
        0 = "No Hispano"
        1 = "Hispano";

    value labelsRaceNative
        0 = "No Nativo Americano"
        1 = "Nativo Americano";

    value labelsRaceWhite
        0 = "No Blanco"
        1 = "Blanco";
run;
/* Frequencies of the indicator variables (no cumulative statistics, with
   frequency plots), displayed through the label formats instead of raw 0/1. */
proc freq data=lib_data.babyWDummy;
    tables lowBW
           AgeGroup1 AgeGroup2 AgeGroup3
           xDeath xDrinking xMarried
           RaceAsian RaceBlack RaceHispanic RaceNative RaceWhite
           xSmoking xSomeCollege
           / nocum plots=freqplot;

    format lowBW         labelsLow.
           AgeGroup1     labelsAgeOne.
           AgeGroup2     labelsAgeTwo.
           AgeGroup3     labelsAgeThree.
           xDeath        labelsDeath.
           xDrinking     labelsDrinking.
           xMarried      labelsMarried.
           RaceAsian     labelsRaceAsian.
           RaceBlack     labelsRaceBlack.
           RaceHispanic  labelsRaceHispanic.
           RaceNative    labelsRaceNative.
           RaceWhite     labelsRaceWhite.
           xSmoking      labelsSmoking.
           xSomeCollege  labelsCollege.;
run;
/* Sort the dataset in place and remove duplicates: NODUPKEY keeps a single
   observation for each distinct combination of the BY variables. */
proc sort data=lib_data.babyWDummy nodupkey;
    by lowBW
       AgeGroup1 AgeGroup2 AgeGroup3
       xDeath xDrinking xMarried
       RaceAsian RaceBlack RaceHispanic RaceNative RaceWhite
       xSmoking xSomeCollege;
run;
/* Correlation analysis across every numeric variable of the dataset. */
proc corr data=lib_data.babyWDummy;
    var _numeric_;
run;
/* Clusters */
ods graphics on;

/* Hierarchical clustering of the observations with the centroid method.
   - The tree is written to its own dataset (lib_data.resultadosCentroid) so it
     is not overwritten by the Ward run below.
   - The global plot option MAXPOINTS= and the dendrogram request are given in a
     single PLOTS specification instead of two separate PLOTS options. */
proc cluster data=lib_data.babyWDummy method=centroid
             nonorm ccc pseudo rmsstd rsquare print=20
             out=lib_data.resultadosCentroid
             plots(maxpoints=300)=den(height=rsq);
    var lowBW
        AgeGroup1 AgeGroup2 AgeGroup3
        xDeath xDrinking xMarried
        RaceAsian RaceBlack RaceHispanic RaceNative RaceWhite
        xSmoking xSomeCollege;
run;

/* Same analysis with Ward's method. Its tree is written to
   lib_data.resultados. */
proc cluster data=lib_data.babyWDummy method=ward
             nonorm ccc pseudo rmsstd rsquare print=20
             out=lib_data.resultados
             plots(maxpoints=300)=den(height=rsq);
    var lowBW
        AgeGroup1 AgeGroup2 AgeGroup3
        xDeath xDrinking xMarried
        RaceAsian RaceBlack RaceHispanic RaceNative RaceWhite
        xSmoking xSomeCollege;
run;
/* Variable clustering over every numeric variable: centroid components, at most
   9 clusters, tree saved to the dataset "tree", and a vertical dendrogram whose
   height axis is the number of clusters. */
proc varclus data=lib_data.babyWDummy
             centroid
             maxclusters=9
             outtree=tree
             plots=dendrogram(vertical height=ncl);
    var _numeric_;
run;

ods graphics off;