-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLinear-Parameter-estimation.html
738 lines (686 loc) · 241 KB
/
Linear-Parameter-estimation.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta content="width=device-width, initial-scale=1.0" name="viewport">
<title>Regression models</title>
<meta content="" name="description">
<meta content="" name="keywords">
<!-- Favicons -->
<link href="assets/img/Favicon-1.png" rel="icon">
<link href="assets/img/Favicon-1.png" rel="apple-touch-icon">
<!-- Google Fonts -->
<link href="https://fonts.googleapis.com/css?family=Open+Sans:300,300i,400,400i,600,600i,700,700i|Raleway:300,300i,400,400i,500,500i,600,600i,700,700i|Poppins:300,300i,400,400i,500,500i,600,600i,700,700i" rel="stylesheet">
<!-- Vendor CSS Files -->
<link href="assets/vendor/aos/aos.css" rel="stylesheet">
<link href="assets/vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet">
<link href="assets/vendor/bootstrap-icons/bootstrap-icons.css" rel="stylesheet">
<link href="assets/vendor/boxicons/css/boxicons.min.css" rel="stylesheet">
<link href="assets/vendor/glightbox/css/glightbox.min.css" rel="stylesheet">
<link href="assets/vendor/swiper/swiper-bundle.min.css" rel="stylesheet">
<!-- Creating a python code section-->
<link rel="stylesheet" href="assets/css/prism.css">
<script src="assets/js/prism.js"></script>
<!-- Template Main CSS File -->
<link href="assets/css/style.css" rel="stylesheet">
<!-- To set the icon, visit https://fontawesome.com/account-->
<script src="https://kit.fontawesome.com/5d25c1efd3.js" crossorigin="anonymous"></script>
<!-- end of icon-->
<script type="text/javascript" async
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML">
</script>
<!-- =======================================================
* Template Name: iPortfolio
* Updated: Sep 18 2023 with Bootstrap v5.3.2
* Template URL: https://bootstrapmade.com/iportfolio-bootstrap-portfolio-websites-template/
* Author: BootstrapMade.com
* License: https://bootstrapmade.com/license/
======================================================== -->
</head>
<body>
<!-- ======= Mobile nav toggle button ======= -->
<i class="bi bi-list mobile-nav-toggle d-xl-none"></i>
<!-- ======= Header ======= -->
<header id="header">
<div class="d-flex flex-column">
<div class="profile">
<img src="assets/img/myphoto.jpeg" alt="" class="img-fluid rounded-circle">
<h1 class="text-light"><a href="index.html">Arun</a></h1>
<div class="social-links mt-3 text-center">
<a href="https://www.linkedin.com/in/arunp77/" target="_blank" class="linkedin"><i class="bx bxl-linkedin"></i></a>
<a href="https://github.com/arunp77" target="_blank" class="github"><i class="bx bxl-github"></i></a>
<a href="https://twitter.com/arunp77_" target="_blank" class="twitter"><i class="bx bxl-twitter"></i></a>
<a href="https://www.instagram.com/arunp77/" target="_blank" class="instagram"><i class="bx bxl-instagram"></i></a>
<a href="https://arunp77.medium.com/" target="_blank" class="medium"><i class="bx bxl-medium"></i></a>
</div>
</div>
<nav id="navbar" class="nav-menu navbar">
<ul>
<li><a href="index.html#hero" class="nav-link scrollto active"><i class="bx bx-home"></i> <span>Home</span></a></li>
<li><a href="index.html#about" class="nav-link scrollto"><i class="bx bx-user"></i> <span>About</span></a></li>
<li><a href="index.html#resume" class="nav-link scrollto"><i class="bx bx-file-blank"></i> <span>Resume</span></a></li>
<li><a href="index.html#portfolio" class="nav-link scrollto"><i class="bx bx-book-content"></i> <span>Portfolio</span></a></li>
<li><a href="index.html#skills-and-tools" class="nav-link scrollto"><i class="bx bx-wrench"></i> <span>Skills and Tools</span></a></li>
<li><a href="index.html#language" class="nav-link scrollto"><i class="bi bi-menu-up"></i> <span>Languages</span></a></li>
<li><a href="index.html#awards" class="nav-link scrollto"><i class="bi bi-award-fill"></i> <span>Awards</span></a></li>
<li><a href="index.html#professionalcourses" class="nav-link scrollto"><i class="bx bx-book-alt"></i> <span>Professional Certification</span></a></li>
<li><a href="index.html#publications" class="nav-link scrollto"><i class="bx bx-news"></i> <span>Publications</span></a></li>
<li><a href="index.html#extra-curricular" class="nav-link scrollto"><i class="bx bx-rocket"></i> <span>Extra-Curricular Activities</span></a></li>
<!-- <li><a href="#contact" class="nav-link scrollto"><i class="bx bx-envelope"></i> <span>Contact</span></a></li> -->
</ul>
</nav><!-- .nav-menu -->
</div>
</header><!-- End Header -->
<main id="main">
<!-- ======= Breadcrumbs ======= -->
<section id="breadcrumbs" class="breadcrumbs">
<div class="container">
<div class="d-flex justify-content-between align-items-center">
<h2></h2>
<ol>
<li><a href="machine-learning.html" class="clickable-box">Content section</a></li>
<li><a href="index.html#portfolio" class="clickable-box">Portfolio section</a></li>
</ol>
</div>
</div>
</section><!-- End Breadcrumbs -->
<!-- Right dropdown menu -->
<div class="right-side-list">
<div class="dropdown">
<button class="dropbtn"><strong>Shortcuts:</strong></button>
<div class="dropdown-content">
<ul>
<li><a href="cloud-compute.html"><i class="fas fa-cloud"></i> Cloud</a></li>
<li><a href="AWS-GCP.html"><i class="fas fa-cloud"></i> AWS-GCP</a></li>
<li><a href="amazon-s3.html"><i class="fas fa-cloud"></i> AWS S3</a></li>
<li><a href="ec2-confi.html"><i class="fas fa-server"></i> EC2</a></li>
<li><a href="Docker-Container.html"><i class="fab fa-docker" style="color: rgb(29, 27, 27);"></i> Docker</a></li>
<li><a href="Jupyter-nifi.html"><i class="fab fa-python" style="color: rgb(34, 32, 32);"></i> Jupyter-nifi</a></li>
<li><a href="snowflake-task-stream.html"><i class="fas fa-snowflake"></i> Snowflake</a></li>
<li><a href="data-model.html"><i class="fas fa-database"></i> Data modeling</a></li>
<li><a href="sql-basics.html"><i class="fas fa-table"></i> SQL basics</a></li>
<li><a href="sql-basic-details.html"><i class="fas fa-database"></i> SQL</a></li>
<li><a href="Bigquerry-sql.html"><i class="fas fa-database"></i> BigQuery</a></li>
<li><a href="scd.html"><i class="fas fa-archive"></i> SCD</a></li>
<li><a href="sql-project.html"><i class="fas fa-database"></i> SQL project</a></li>
<!-- Add more subsections as needed -->
</ul>
</div>
</div>
</div>
<!-- ======= Portfolio Details Section ======= -->
<section id="portfolio-details" class="portfolio-details">
<div class="container">
<div class="row gy-4">
<div class="col-lg-8">
<div class="portfolio-details-slider swiper">
<div class="swiper-wrapper align-items-center">
<div class="swiper-slide">
<h1>Regression Model parameter estimation</h1>
<figure>
<img src="assets/img/data-engineering/Linear-reg1.png" alt="" style="max-width: 60%; max-height: 60%;">
<figcaption></figcaption>
</figure>
</div>
</div>
<div class="swiper-pagination"></div>
</div>
</div>
<div class="col-lg-4 grey-box">
<div class="section-title">
<h3>Content</h3>
<ol>
<li><a href="#introduction">Introduction</a></li>
<li><a href="#Relationship-of-regression-lines">Relationship of regression lines</a></li>
<li><a href="#Types-of-Linear-Regression">Types of Linear Regression</a></li>
<li><a href="#Mathematical-1">Mathematical Explanation</a></li>
<li><a href="#Assumption-of-LR">Assumptions of Linear Regression</a></li>
<li><a href="#evaluation-metrics-for-LR">Evaluation Metrics for Linear Regression</a></li>
<li><a href="#overfit-goodfit-underfit">Overfitting, Good Fit, and Underfitting in Machine Learning</a></li>
<li><a href="#reference">Reference</a></li>
</ol>
</div>
</div>
</div>
<section id="introduction">
<h2>Introduction</h2>
<ul>
<li>The values of \(\beta_0\), \(\beta_1\), and \(\sigma^2\) will almost never be known to an investigator.</li>
<li><p>Instead, sample data consists of n observed pairs</p>
<p>(\(x_1\), \(y_1\)), … , (\(x_n \), \(y_n\)),</p>
<p>from which the model parameters and the true regression line itself can be estimated.</p>
</li>
<li><p>The data (pairs) are assumed to have been obtained independently of one another.</p>
<p>That is,</p>
<p>\(Y_i =\beta_0+\beta_1 x_i + \epsilon_i\) for \(i = 1, 2, \ldots, n\),</p>
<p>where the \(n\) deviations \(\epsilon_1, \epsilon_2, \ldots, \epsilon_n\) are independent random variables.</p>
</li>
<li><p>The “best fit” line is motivated by the principle of least squares, which can be traced back to the German mathematician Gauss (1777–1855):</p>
<img src="assets/img/data-engineering/Multi-lin-reg.png" alt="" style="max-width: 60%; max-height: 60%;">
</li>
</ul>
<p>A line provides the best fit to the data if the sum of the squared vertical distances (deviations) from the observed points to that line is as small as it can be.</p>
<ul>
<li><p>The sum of squared vertical deviations from the points \((x_1, y_1),…, (x_n, y_n)\) to the line is then:</p>
<p>\(f(b_0, b_1) = \sum_{i=1}^n [y_i - (b_0+b_1 x_i)]^2\)</p>
</li>
<li><p>The point estimates of \(\beta_0\) and \(\beta_1\), denoted by \(\hat{\beta}_0\) and \(\hat{\beta}_1\), are called the least squares estimates; they are those values that minimize \(f(b_0, b_1)\).</p>
</li>
<li><p>The fitted regression line or least squares line is then the line whose equation is:</p>
<p>\(y = \hat{\beta}_0+\hat{\beta}_1 x\).</p>
</li>
</ul>
</section>
<section id="Estimation-par">
<ul>
<li><p>The minimizing values of \(b_0\) and \(b_1\) are found by taking partial derivatives of \(f(b_0, b_1)\) with respect to both \(b_0\) and \(b_1\), equating them both to zero [analogously to \(f'(b) = 0\) in univariate calculus], and solving the equations</p>
<p>\(\frac{\partial f(b_0, b_1)}{\partial b_0} = \sum 2 (y_i - b_0 - b_1 x_i) (-1) = 0\)</p>
<p>\(\frac{\partial f(b_0, b_1)}{\partial b_1} = \sum 2 (y_i - b_0 - b_1 x_i) (-x_i) = 0\).</p>
</li>
<li><p>Which in turn gives two equations:</p>
<p>\(\sum (y_i - b_0 - b_1 x_i) = 0\)</p>
<p>\(\sum (y_i x_i- b_0x_i - b_1 x_i^2) = 0\).</p>
<p>after some simplification, we can get</p>
<p>\(\boxed{b_1 = \hat{\beta}_1 = \frac{\sum (x_i - \bar{x})(y_i - \bar{y})}{\sum (x_i - \bar{x})^2} = \frac{S_{xy}}{S_{xx}}}\)</p>
<p>where</p>
<ul>
<li>\(S_{xy}= \sum x_i y_i - \frac{\sum x_i \sum y_i}{n}\) </li>
<li>\(S_{xx} = \sum x_i^2 - \frac{(\sum x_i)^2}{n}\)</li>
</ul>
<p>(Typically columns for \(x_i, y_i, x_i y_i\) and \(x_i^2\) are constructed and then \(S_{xy}\) and \(S_{xx}\) are calculated.)</p>
</li>
<li><p>The least squares estimate of the intercept \(\beta_0\) of the true regression line is</p>
<p>\(\boxed{b_0 = \hat{\beta}_0 = \frac{\sum y_i - \hat{\beta}_1 \sum x_i}{n} = \bar{y}- \hat{\beta}_1 \bar{x}}\).</p>
</li>
<li><p>The computational formulas for \(S_{xy}\) and \(S_{xx}\) require only the summary statistics \(\sum x_i, \sum y_i, \sum x_i y_i\) and \(\sum x_i^2\) (\(\sum y_i^2\) will be needed shortly for the variance.)</p>
</li>
</ul>
</section>
<section id="Fitted-values">
<h2>Fitted values<a class="anchor-link" href="#Fitted-values">¶</a></h2><h3 id="1.-Fitted-values">1. Fitted values<a class="anchor-link" href="#1.-Fitted-values">¶</a></h3><p>The fitted (or predicted) values \(\hat{y}_1\), \(\hat{y}_2\), \(\ldots\), \(\hat{y}_n\) are obtained by substituting \(x_1, x_2, \ldots, x_n\) into the equation of the estimated regression line:</p>
<ul>
<li>\(\hat{y}_1 = \hat{\beta}_0 + \hat{\beta}_1 x_1\)</li>
<li>\(\hat{y}_2 = \hat{\beta}_0 + \hat{\beta}_1 x_2\)</li>
<li>.</li>
<li>.</li>
<li>.</li>
<li>\(\hat{y}_n = \hat{\beta}_0 + \hat{\beta}_1 x_n\)</li>
</ul>
<h3 id="2.-Residuals">2. Residuals<a class="anchor-link" href="#2.-Residuals">¶</a></h3><ul>
<li>The differences \(y_1 - \hat{y}_1\), \(y_2 - \hat{y}_2\), \(\ldots\), \(y_n - \hat{y}_n\) between the observed and fitted \(y\) values are called the residuals.</li>
<li><p>When the estimated regression line is obtained via the principle of least squares, the sum of the residuals is zero (apart from rounding error), whatever the error distribution, since</p>
<p>\(\sum (y_i - (\hat{\beta}_0+ \hat{\beta}_1 x_i)) = n \bar{y}- n \hat{\beta}_0 - \hat{\beta}_1 n \bar{x} = n \hat{\beta}_0 - n \hat{\beta}_0 = 0\).</p>
<ul>
<li>\(y_i-\hat{y}_i \gt 0 \Rightarrow \) the point \((x_i, y_i)\) lies above the line</li>
<li>\(y_i-\hat{y}_i \lt 0 \Rightarrow \) the point \((x_i, y_i)\) lies below the line</li>
</ul>
</li>
<li><p>The residual can be thought of as a measure of deviation and we can summarize the notation in the following way:</p>
<p>\(Y_i - \hat{Y}_i = \hat{\epsilon}_i\)</p>
</li>
</ul>
<h3 id="3.-Estimating-\(%5Csigma%5E2\)-and-\(%5Csigma\)">3. Estimating \(\sigma^2\) and \(\sigma\)<a class="anchor-link" href="#3.-Estimating-\(%5Csigma%5E2\)-and-\(%5Csigma\)">¶</a></h3><ul>
<li><p>The parameter \(\sigma^2\) determines the amount of spread about the true regression line.</p>
<p><img src="assets/img/data-engineering/linear-spread.png" alt="" style="max-width: 60%; max-height: 60%;"></p>
</li>
<li><p>An estimate of \(\sigma^2\) will be used in confidence interval (CI) formulas and hypothesis-testing procedures presented in the next two sections.</p>
</li>
<li>Many large deviations (residuals) suggest a large value of \(\sigma^2\), whereas deviations all of which are small in magnitude suggest that \(\sigma^2\) is small. </li>
<li><p><strong>Error sum of squares (SSE):</strong> The error sum of squares SSE can be interpreted as a measure of how much variation in y is left unexplained by the model—that is, how much cannot be attributed to a linear relationship.</p>
<p>The error sum of squares (equivalently, residual sum of squares), denoted by SSE, is:</p>
<p>\(\boxed{{\rm SSE} = \sum (y_i - \hat{y}_i)^2 = \sum [y_i - (\hat{\beta}_0+\hat{\beta}_1 x_i)]^2}\)</p>
<p>and the estimate of \(\sigma^2\) is</p>
<p>\(\hat{\sigma}^2 =s^2 = \frac{\text{SSE}}{n-2} = \frac{\sum (y_i - \hat{y}_i)^2}{n-2} = \frac{1}{n-2} \sum_{i=1}^n \hat{\epsilon}_i^2\).</p>
<p>(Note that the homoscedasticity assumption comes into play here).</p>
</li>
<li><p>The divisor \(n - 2\) in \(s^2\) is the number of degrees of freedom \((df)\) associated with SSE and the estimate \(s^2\).</p>
</li>
<li>This is because to obtain \(s^2\), the two parameters \(\beta_0\) and \(\beta_1\) must first be estimated, which results in a loss of \(2\) \(df\) (just as \(\mu\) had to be estimated in one-sample problems, resulting in an estimated variance based on \(n - 1\) \(df\) in our previous t-tests).</li>
<li><p>Computation of SSE from the defining formula involves much tedious arithmetic, because both the predicted values and residuals must first be calculated.</p>
<p><img src="" alt="image.png"></p>
<ul>
<li><p>The points in the first plot all fall exactly on a straight line. In this case, all (\(100\%\)) of the sample variation in <em>y</em> can be attributed to the fact that <em>x</em> and <em>y</em> are linearly related in combination with variation in <em>x</em>.</p>
</li>
<li><p>The points in the second plot do not fall exactly on a line, but compared to overall y variability, the deviations from the least squares line are small.</p>
</li>
<li><p>It is reasonable to conclude in this case that much of the observed y variation can be attributed to the approximate linear relationship between the variables postulated by the simple linear regression model.</p>
</li>
<li>When the scatter plot looks like that in the third plot, there is substantial variation about the least squares line relative to overall y variation, so the simple linear regression model fails to explain variation in y by relating y to x.</li>
</ul>
<p>In the first plot SSE = 0, and there is no unexplained variation, whereas unexplained variation is small for second, and large for the third plot.</p>
</li>
</ul>
<h3 id="4.-Total-sum-of-squares-(SST)-or-Total-Variation">4. Total sum of squares (SST) or Total Variation<a class="anchor-link" href="#4.-Total-sum-of-squares-(SST)-or-Total-Variation">¶</a></h3><ul>
<li><p>A quantitative measure of the total amount of variation in observed y values is given by the total sum of squares.</p>
<p>\(\boxed{{\rm SST} = S_{yy} = \sum (y_i -\bar{y})^2 = \sum y_i^2 - \frac{(\sum y_i)^2}{n}}\).</p>
</li>
<li><p>The SST is the sum of squared deviations about the sample mean of the observed y values – when no predictors are taken into account</p>
</li>
</ul>
<h4 id="4.1.-Difference-between-SST-and-SSE:">4.1. Difference between SST and SSE:<a class="anchor-link" href="#4.1.-Difference-between-SST-and-SSE:">¶</a></h4><ul>
<li><p>The SST is, in some sense, as bad as SSE can get: if there is no regression relationship (i.e., the slope is 0), then</p>
<p>\(\hat{\beta}_0 = \bar{y}- \hat{\beta}_1 \bar{x} \Rightarrow \hat{y} = \hat{\beta}_0+\underbrace{\hat{\beta}_1}_{=0} x = \hat{\beta}_0 = \bar{y}\)</p>
<p><img src="assets/img/data-engineering/linear-spread2.png" alt="" style="max-width: 70%; max-height: 70%;"></p>
<p>The SSE &lt; SST unless the horizontal line itself is the least squares line.</p>
</li>
</ul>
<h3 id="5.-Coefficient-of-determination-(\(r%5E2\))">5. Coefficient of determination (\(r^2\))<a class="anchor-link" href="#5.-Coefficient-of-determination-(\(r%5E2\))">¶</a></h3><p>\(\boxed{r^2 = 1- \frac{{\rm SSE}}{{\rm SST}}} \Rightarrow \) (a number between 0 and 1.)</p>
<ul>
<li>The ratio SSE/SST is the proportion of total variation that cannot be explained by the simple linear regression model and \(r^2\) is the proportion of the observed \(y\) variation explained by the model.</li>
<li>It is interpreted as the proportion of observed y variation that can be explained by the simple linear regression model (attributed to an approximate linear relationship between y and x).</li>
<li>The higher the value of \(r^2\), the more successful is the simple linear regression model in explaining y variation.</li>
</ul>
<h3 id="6.-Regression--sum-of-squares-(SSR)">6. Regression sum of squares (SSR)<a class="anchor-link" href="#6.-Regression--sum-of-squares-(SSR)">¶</a></h3><p>The coefficient of determination can be written in a slightly different way by introducing a third sum of squares—regression sum of squares, SSR—given by:</p>
<p>\(\boxed{{\rm SSR} = \sum (\hat{y}_i - \bar{y})^2 = {\rm SST}- {\rm SSE}}\).</p>
<p>Regression sum of squares is interpreted as the amount of total variation that is explained by the model.</p>
<p>Then we have</p>
<p>\(\boxed{r^2 = 1- \frac{{\rm SSE}}{{\rm SST}} = \frac{{\rm SST} - {\rm SSE}}{{\rm SST}} = \frac{{\rm SSR}}{{\rm SST}}} = \frac{\text{Explained Variation}}{\text{Total Variation}}\)</p>
<p>the ratio of explained variation to total variation.</p>
<p><img src="assets/img/data-engineering/linear-spread3.png" alt="" style="max-width: 60%; max-height: 60%;"></p>
</section>
<section id="Hypothesis">
<ul>
<li><p>Testing for significance using the slope, \(\beta_1\):</p>
<p><img src="assets/img/data-engineering/hypo1.png" alt="" style="max-width: 40%; max-height: 40%;"></p>
<ul>
<li>If \(\beta_1 = 0\), then \(y=\beta_0\), no matter what value \(x\) is.</li>
<li>Therefore there is no linear relationship between \(x\) and \(y\) when \(\beta_1 = 0\).</li>
</ul>
</li>
<li><p><strong>Hypothesis test of significance, t-test:</strong></p>
<p>The most commonly encountered pair of hypotheses about \(\beta_1\)</p>
<ul>
<li>\(H_0: \beta_1 = 0 \) </li>
<li><p>\(H_a: \beta_1 \neq 0\).</p>
<p>We are going to see if we have enough evidence to support the alternative hypothesis that the slope is not equal to zero. If we find such evidence, we will conclude that there is a linear relationship between \(x\) and \(y\).</p>
<p>The test statistic is \(\boxed{t=\frac{b_1}{S_{b_1}}}\),</p>
<p>where \(S_{b_1}\) is the standard error for the slope. To calculate this, we use following formula:</p>
<p>\(\boxed{S_{b_1} = \frac{s}{\sqrt{\sum(x_i - \bar{x})^2}}}\),</p>
<p>where \(s = \sqrt{\frac{\text{SSE}}{n-2}}\)</p>
</li>
</ul>
</li>
</ul>
<h3 id="Ways-to-perform-hypothesis-testing">Ways to perform hypothesis testing<a class="anchor-link" href="#Ways-to-perform-hypothesis-testing">¶</a></h3><p>There are several ways to check the null and alternative hypotheses when performing hypothesis testing. Here are some common approaches:</p>
<ol>
<li><p><strong>Critical Value Approach:</strong> The critical value approach involves comparing a test statistic (calculated from the data) to a predetermined critical value based on the chosen significance level (alpha). The steps involved in the critical value approach are as follows:</p>
<ul>
<li>Null hypothesis (H0) and alternative hypothesis (Ha) are defined.</li>
<li>Test statistic (e.g., z-score or t-statistic) is calculated based on the sample data.</li>
<li>Critical value(s) (denoted as z_crit or t_crit) are determined based on the chosen significance level (alpha) and the distribution of the test statistic. </li>
<li><p><strong>Comparison:</strong> If the test statistic falls within the critical region (i.e.,</p>
<ul>
<li>test statistic &gt; critical value for a right-tailed test or </li>
<li><p>test statistic &lt; critical value for a left-tailed test),</p>
<p>reject the null hypothesis in favor of the alternative hypothesis. Otherwise, if the test statistic falls outside the critical region, fail to reject the null hypothesis.</p>
</li>
</ul>
</li>
</ul>
<p>The critical value approach is commonly used in tests such as z-tests and t-tests, where critical values are obtained from standard tables or calculated based on the desired significance level and the test's distribution.</p>
</li>
<li><p><strong>P-Value Approach:</strong> The p-value approach, also known as the probability approach, involves calculating the p-value associated with the observed test statistic. The p-value is the probability of obtaining a test statistic as extreme or more extreme than the observed value, assuming that the null hypothesis is true. The steps involved in the p-value approach are as follows:</p>
<ul>
<li>Null hypothesis (H0) and alternative hypothesis (Ha) are defined.</li>
<li>Test statistic (e.g., z-score or t-statistic) is calculated based on the sample data.</li>
<li>P-value is calculated, representing the probability of obtaining a test statistic as extreme or more extreme than the observed value, assuming the null hypothesis is true.</li>
<li><strong>Comparison:</strong> If the p-value is less than or equal to the chosen significance level (alpha), reject the null hypothesis in favor of the alternative hypothesis. Otherwise, if the p-value is greater than the significance level, fail to reject the null hypothesis.</li>
</ul>
<p>The p-value approach provides a measure of the strength of evidence against the null hypothesis. A smaller p-value indicates stronger evidence against the null hypothesis, suggesting that the observed data is unlikely to occur if the null hypothesis is true. The p-value approach allows for more flexibility in choosing significance levels and can be used with a wide range of statistical tests.</p>
</li>
<li><p><strong>Confidence Interval Approach:</strong></p>
<ul>
<li>Point estimate and its standard error are calculated based on the sample data.</li>
<li>Confidence interval is constructed around the point estimate, typically using the formula: point estimate ± (critical value * standard error).</li>
<li><strong>Comparison:</strong> If the null hypothesis value falls outside the confidence interval, reject the null hypothesis. Otherwise, if the null hypothesis value is inside the confidence interval, fail to reject the null hypothesis.</li>
</ul>
</li>
<li><p><strong>Likelihood Ratio Test:</strong></p>
<ul>
<li>Likelihood of the data under the null hypothesis (L(H0)) and alternative hypothesis (L(Ha)) is calculated based on the sample data.</li>
<li><p>Likelihood ratio is computed as the ratio of the likelihoods:</p>
<p>\(\boxed{\text{likelihood ratio} = \frac{L(Ha)}{L(H0)}}\).</p>
</li>
<li><p><strong>Comparison:</strong> If the likelihood ratio is greater than the critical value corresponding to the chosen significance level or if the p-value associated with the likelihood ratio is less than the chosen significance level, reject the null hypothesis. Otherwise, if the likelihood ratio is not greater than the critical value or the p-value is not less than the significance level, fail to reject the null hypothesis.</p>
</li>
</ul>
</li>
<li><p><strong>Bayesian Approach:</strong></p>
<ul>
<li>Prior probabilities (P(H0) and P(Ha)) are specified for the null and alternative hypotheses.</li>
<li><p>Posterior probabilities (P(H0|data) and P(Ha|data)) are calculated using Bayes' theorem:</p>
<p>\(\boxed{P(H0|\text{data}) = \frac{P(H0) * P(\text{data}|H0)}{P(\text{data})}}\) and</p>
<p>\(\boxed{P(Ha|\text{data}) = \frac{P(Ha) * P(\text{data}|Ha)}{P(\text{data})}}\),</p>
<p>where P(data) is the marginal likelihood.</p>
</li>
<li><p><strong>Comparison:</strong> Decision is made based on the posterior probabilities, such as comparing P(H0|data) to a threshold. If P(H0|data) is lower than the threshold, reject the null hypothesis. Otherwise, if P(H0|data) is higher than or equal to the threshold, fail to reject the null hypothesis.</p>
</li>
</ul>
</li>
</ol>
<h3 id="Test-statistic">Test statistic<a class="anchor-link" href="#Test-statistic">¶</a></h3><h4 id="1.-z-score">1. z-score<a class="anchor-link" href="#1.-z-score">¶</a></h4><ul>
<li>The z-score is used in hypothesis testing when the sample size is large or when the population standard deviation is known.</li>
<li><p>The formula for calculating the z-score is:</p>
<p>\(\boxed{z = \frac{\bar{x} - \mu}{(\sigma / \sqrt{n})}}\),</p>
<p>where \(\bar{x}\) is the sample mean, \(\mu\) is the population mean, \(\sigma\) is the population standard deviation, and \(n\) is the sample size.</p>
</li>
<li><p>The z-score follows a standard normal distribution (mean = 0, standard deviation = 1), allowing for comparison to critical values or calculation of p-values.</p>
</li>
</ul>
<h4 id="2.-t-statistic">2. t-statistic<a class="anchor-link" href="#2.-t-statistic">¶</a></h4><ul>
<li>The t-statistic is used when the sample size is small and the population standard deviation is unknown.</li>
<li>The formula for calculating the t-statistic depends on the specific test being performed (e.g., one-sample t-test, independent samples t-test, paired samples t-test).</li>
<li>The t-statistic follows a t-distribution with degrees of freedom (df) determined by the sample size and the specific test being conducted.</li>
<li>Comparison to critical values or calculation of p-values is done using the t-distribution</li>
</ul>
<blockquote><p><strong>One-Sample t-Test:</strong></p>
<p>The one-sample t-test is used to determine if the mean of a single sample significantly differs from a specified population mean.</p>
<ul>
<li><p>The formula for calculating the t-statistic in a one-sample t-test is:</p>
<p>\(\boxed{t = \frac{\bar{x}-\mu}{s/\sqrt{n}}}\),</p>
<p>where</p>
<ul>
<li>\(\bar{x}\) is the sample mean, </li>
<li>\(\mu\) is the specified population mean, </li>
<li>\(s\) is the sample standard deviation, and </li>
<li>\(n\) is the sample size.</li>
</ul>
</li>
<li><p>The t-statistic follows a t-distribution with \((n - 1)\) degrees of freedom.</p>
</li>
<li>The null hypothesis (\(H0\)): is that the population mean is equal to the specified value (\(\mu\)), and </li>
<li>the alternative hypothesis (\(Ha\)): is that the population mean is not equal to \(\mu\).</li>
</ul>
<p><strong>Independent Samples t-Test:</strong></p>
<p>The independent samples t-test is used to compare the means of two independent groups and determine if they significantly differ from
each other.</p>
<ul>
<li><p>The formula for calculating the t-statistic in an independent samples t-test is:</p>
<p>\(\boxed{t = \frac{\bar{x}_1 - \bar{x}_2}{\sqrt{\frac{s_1^2}{n_1} + \frac{s_2^2}{n_2}}}}\),</p>
<p>where</p>
<ul>
<li>\(\bar{x}_1\) and \(\bar{x}_2\) are the sample means, </li>
<li>\(s_1\) and \(s_2\) are the sample standard deviations, </li>
<li>\(n_1\) and \(n_2\) are the sample sizes</li>
</ul>
<p>of the two groups.</p>
</li>
<li><p>The t-statistic follows a t-distribution with degrees of freedom calculated using a formula that takes into account the sample sizes
and variances of the two groups.</p>
</li>
<li>The null hypothesis (\(H0\)): is that the means of the two groups are equal, and </li>
<li>the alternative hypothesis (\(Ha\)) is that the means are not equal.</li>
</ul>
<p><strong>Paired Samples t-Test:</strong></p>
<p>The paired samples t-test, also known as the dependent samples t-test, is used to compare the means of two related or paired samples.</p>
<ul>
<li><p>The formula for calculating the t-statistic in a paired samples t-test is:</p>
<p>\(\boxed{t = \frac{(\bar{x}_d - \mu_d)}{(s_d / \sqrt{n})}}\),</p>
<p>where</p>
<ul>
<li>\(\bar{x}_d\) is the mean difference of the paired observations, </li>
<li>\(\mu_d\) is the specified population mean difference (usually \(0\) under the null hypothesis), </li>
<li>\(s_d\) is the standard deviation of the differences, and </li>
<li>\(n\) is the number of paired observations.</li>
</ul>
</li>
<li><p>The t-statistic follows a t-distribution with \((n - 1)\) degrees of freedom.</p>
</li>
<li>The null hypothesis (\(H0\)) is that the mean difference is equal to the specified value (\(\mu_d\), often \(0\)), and </li>
<li>the alternative hypothesis (\(Ha\)) is that the mean difference is not equal to \(\mu_d\).</li>
</ul>
</blockquote>
<h4 id="3.-F-Statistic:">3. F-Statistic:<a class="anchor-link" href="#3.-F-Statistic:">¶</a></h4><ul>
<li>The F-statistic is used in <em>analysis of variance (ANOVA)</em> tests to compare the variability between groups to the variability within groups.</li>
<li>The formula for calculating the F-statistic depends on the specific ANOVA test being performed (e.g., one-way ANOVA, two-way ANOVA).</li>
<li>The F-statistic follows an F-distribution with different degrees of freedom for the numerator and denominator, which are determined by the number of groups and sample sizes.</li>
<li>Comparison to critical values or calculation of p-values is done using the F-distribution.</li>
</ul>
<blockquote><p><strong>Analysis of Variance (ANOVA):</strong> is a statistical technique used to test for significant differences between the means of two or more
groups. ANOVA partitions the total variability in the data into different components to assess the impact of different sources of
variation. Here's an explanation of ANOVA with formulas:</p>
<ol>
<li><strong>One-Way ANOVA:</strong> </li>
</ol>
<ul>
<li>One-Way ANOVA is used when comparing the means of two or more groups on a single independent variable (factor). </li>
<li><p>The formula for calculating the F-statistic in a one-way ANOVA is:</p>
<p>\(F= \frac{SSB / (k - 1)}{(SSW / (n - k))}\),</p>
<p>where</p>
<ul>
<li>\(SSB\) is the between-group sum of squares: \(SSB = \sum n_i (\bar{x}_i-\bar{x})^2\), where \(n_i\) is the sample size of the ith
group, \(\bar{x}_i\) is the mean of the ith group, \(\bar{x}\) is the overall mean.</li>
<li>\(SSW\) is the within-group sum of squares: \(SSW = \sum_{i} \sum_{j} (x_{ij} - \bar{x}_i)^2\), where \(x_{ij}\) is the jth observation in the
ith group, \(\bar{x}_i\) is the mean of the ith group.</li>
<li>Sometimes, we also need the total sum of squares: \(SST = SSB + SSW\).</li>
<li>\(k\) is the number of groups, and </li>
<li>\(n\) is the total sample size.</li>
<li>Check example: <a href="https://statkat.com/compute-sum-of-squares-ANOVA.php">https://statkat.com/compute-sum-of-squares-ANOVA.php</a></li>
</ul>
</li>
<li><p>SSB represents the variability between the group means, and SSW represents the variability within each group.</p>
</li>
<li>The F-statistic follows an F-distribution with \((k - 1)\) numerator degrees of freedom and \((n - k)\) denominator degrees of freedom.</li>
<li>The null hypothesis (\(H_0\)) is that the means of all groups are equal, and </li>
<li>the alternative hypothesis (\(H_a\)) is that at least one group mean is different.</li>
</ul>
<ol>
<li><strong>Two-Way ANOVA:</strong></li>
</ol>
<ul>
<li>Two-Way ANOVA is used when comparing the means of two or more groups on two independent variables (factors).</li>
<li>The formula for calculating the F-statistic in a two-way ANOVA involves multiple sources of variation, including main effects and
interaction effects. </li>
<li>The specific formulas depend on the design of the study (e.g., balanced/unbalanced, fixed/random effects).</li>
<li>The F-statistic for each effect (main effect or interaction effect) is calculated by dividing the mean square for that effect (its sum of
squares divided by its degrees of freedom) by the mean square error.</li>
<li>The F-statistics follow an F-distribution with appropriate degrees of freedom.</li>
<li>The null hypothesis (H0) is that there is no significant effect of the factors or their interaction on the response variable, and</li>
<li>The alternative hypothesis (Ha) is that there is a significant effect.</li>
</ul>
</blockquote>
<h4 id="4.-Chi-Square-Statistic:">4. Chi-Square Statistic:<a class="anchor-link" href="#4.-Chi-Square-Statistic:">¶</a></h4><ul>
<li>The chi-square statistic is used for testing relationships between <em>categorical variables</em> or for testing goodness of fit.</li>
<li>The formula for calculating the chi-square statistic depends on the specific test being conducted (e.g., chi-square test of independence, chi-square goodness of fit test).</li>
<li>The chi-square statistic follows a chi-square distribution with degrees of freedom determined by the number of categories or the degrees of freedom associated with the test.</li>
<li>Comparison to critical values or calculation of p-values is done using the chi-square distribution.</li>
</ul>
<blockquote><p>The Chi-Square statistic is a test statistic used in hypothesis testing to assess the relationship between categorical variables or to
test for goodness of fit. It compares the observed frequencies with the expected frequencies under a specified hypothesis.</p>
<ol>
<li><strong>Chi-Square Test of Independence:</strong></li>
</ol>
<ul>
<li>The Chi-Square test of independence is used to determine if there is a significant association between two categorical variables.</li>
<li><p>The formula for calculating the Chi-Square statistic in a test of independence is:</p>
<p>\(\chi^2 = \sum \frac{(O-E)^2}{E}\),</p>
<p>where \(\sum\) represents the summation symbol, \(O\) is the observed frequency in each cell of a contingency table, and \(E\) is the
expected frequency under the assumption of independence.</p>
</li>
<li>The observed frequencies (\(O\)) are the actual counts of observations in each cell of the contingency table, and the expected
frequencies (\(E\)) are the counts that would be expected if the two variables were independent.</li>
<li>The Chi-Square statistic follows a Chi-Square distribution with \((r - 1)(c - 1)\) degrees of freedom, where \(r\) and \(c\) are the numbers of rows and columns
in the contingency table.</li>
</ul>
<ol>
<li><strong>Chi-Square Goodness of Fit Test:</strong></li>
</ol>
<ul>
<li>The Chi-Square goodness of fit test is used to determine if observed categorical data follows a specified distribution or expected
frequencies.</li>
<li><p>The formula for calculating the Chi-Square statistic in a goodness of fit test is:</p>
<p>\(\chi^2 = \sum \frac{(O-E)^2}{E}\),</p>
<p>where \(\sum\) represents the summation symbol, \(O\) is the observed frequency for each category, and \(E\) is the expected frequency
under the null hypothesis.</p>
</li>
<li>The observed frequencies (\(O\)) are the actual counts of observations in each category, and the expected frequencies (\(E\)) are the
counts that would be expected if the null hypothesis is true.</li>
<li>The Chi-Square statistic follows a Chi-Square distribution with degrees of freedom determined by the number of categories minus one.</li>
</ul>
</blockquote>
<h2 id="Calculation">Calculation<a class="anchor-link" href="#Calculation">¶</a></h2><p>To compute these statistics, we may need to find the following:</p>
<table>
<thead><tr>
<th>xi</th>
<th>yi</th>
<th>yi_hat=beta0+beta1*xi</th>
<th>Error</th>
<th>Squared error (yi - yi_hat)^2</th>
<th>Deviation yi - ybar</th>
<th>Squared Deviation (yi - ybar)^2</th>
</tr>
</thead>
<tbody>
<tr>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
</tr>
<tr>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
</tr>
<tr>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
</tr>
<tr>
<td></td>
<td></td>
<td></td>
<td></td>
<td>SSE=...</td>
<td></td>
<td>SST=...</td>
</tr>
</tbody>
</table>
<p>where</p>
<ul>
<li>xi =\(x_i\)</li>
<li>yi = \(y_i\)</li>
<li>yi_hat = beta0 + beta1 * xi = \(\hat{y}_i = \hat{\beta}_0 + \hat{\beta}_1 x_i\) </li>
<li>Error yi - yi_hat = Error \(y_i - \hat{y}_i\) </li>
<li>Squared error (yi - yi_hat)^2 = Squared error \((y_i - \hat{y}_i)^2\) </li>
<li>Deviation yi - ybar = Deviation \(y_i - \bar{y}\) </li>
<li>Squared Deviation (yi - ybar)^2 = Squared deviation \((y_i - \bar{y})^2\) </li>
</ul>
<p>and</p>
<table>
<thead><tr>
<th>xi</th>
<th>yi</th>
<th>xi-xbar</th>
<th>yi-ybar</th>
<th>(xi-xbar)(yi-ybar)</th>
<th>(xi-xbar)^2</th>
</tr>
</thead>
<tbody>
<tr>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
</tr>
<tr>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
</tr>
<tr>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
<td>.</td>
</tr>
</tbody>
</table>
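<p>where xbar = \(\bar{x}\) and ybar = \(\bar{y}\) are the sample means that must be calculated first; the column sums then give the slope estimate \(\hat{\beta}_1 = \frac{\sum (x_i - \bar{x})(y_i - \bar{y})}{\sum (x_i - \bar{x})^2}\).</p>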
</section>
<section id="Example">
<ul>
<li>You can refer to the <a href="https://github.com/arunp77/Machine-Learning/tree/main/Projects-ML" target="_blank">following project</a> for an example of linear regression analysis. </li>
</ul>
</section>
<!-------Reference ------->
<section id="reference">
<h2>References</h2>
<ul>
<li>My github Repositories on Remote sensing <a href="https://github.com/arunp77/Machine-Learning/" target="_blank">Machine learning</a></li>
<li><a href="https://mlu-explain.github.io/linear-regression/" target="_blank">A Visual Introduction To Linear regression</a> (Best reference for theory and visualization).</li>
<li>Book on Regression model: <a href="https://avehtari.github.io/ROS-Examples/" target="_blank">Regression and Other Stories</a></li>
<li>Book on Statistics: <a href="https://hastie.su.domains/Papers/ESLII.pdf" target="_blank">The Elements of Statistical Learning</a></li>
<li><a href="https://www.colorado.edu/amath/sites/default/files/attached-files/ch12_0.pdf">https://www.colorado.edu/amath/sites/default/files/attached-files/ch12_0.pdf</a></li>
</ul>
</section>
<hr>
<div style="background-color: #f0f0f0; padding: 15px; border-radius: 5px;">
<h3>Some other interesting things to know:</h3>
<ul style="list-style-type: disc; margin-left: 30px;">
<li>Visit my website on <a href="sql-project.html">Data, Big Data, Data-modeling, Datawarehouse, SQL, cloud-compute.</a></li>
<li>Visit my website on <a href="Data-engineering.html">Data engineering</a></li>
</ul>
</div>
<p></p>
<div class="navigation">
<a href="index.html#portfolio" class="clickable-box">
<span class="arrow-left">Portfolio section</span>
</a>
<a href="machine-learning.html" class="clickable-box">
<span class="arrow-right">Content</span>
</a>
</div>
</div>
</div>
</section><!-- End Portfolio Details Section -->
</main><!-- End #main -->
<!-- ======= Footer ======= -->
<footer id="footer">
<div class="container">
<div class="copyright">
© Copyright <strong><span>Arun</span></strong>
</div>
</div>
</footer><!-- End Footer -->
<a href="#" class="back-to-top d-flex align-items-center justify-content-center"><i class="bi bi-arrow-up-short"></i></a>
<!-- Vendor JS Files -->
<script src="assets/vendor/purecounter/purecounter_vanilla.js"></script>
<script src="assets/vendor/aos/aos.js"></script>
<script src="assets/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
<script src="assets/vendor/glightbox/js/glightbox.min.js"></script>
<script src="assets/vendor/isotope-layout/isotope.pkgd.min.js"></script>
<script src="assets/vendor/swiper/swiper-bundle.min.js"></script>
<script src="assets/vendor/typed.js/typed.umd.js"></script>
<script src="assets/vendor/waypoints/noframework.waypoints.js"></script>
<script src="assets/vendor/php-email-form/validate.js"></script>
<!-- Template Main JS File -->
<script src="assets/js/main.js"></script>
<script>
// Run syntax highlighting once the DOM has been parsed.
document.addEventListener("DOMContentLoaded", function () {
  // None of the vendor scripts listed directly above this block load
  // highlight.js, so the `hljs` global may be missing; an unguarded reference
  // would throw a ReferenceError here.
  if (typeof hljs === "undefined") {
    return;
  }
  // `initHighlightingOnLoad` is deprecated in highlight.js 10.6 and removed in
  // v11; prefer `highlightAll` when the loaded version provides it and fall
  // back to the legacy entry point otherwise.
  if (typeof hljs.highlightAll === "function") {
    hljs.highlightAll();
  } else if (typeof hljs.initHighlightingOnLoad === "function") {
    hljs.initHighlightingOnLoad();
  }
});
</script>
</body>
</html>