-
Notifications
You must be signed in to change notification settings - Fork 0
/
Chapter7.R
685 lines (530 loc) · 29.4 KB
/
Chapter7.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
# This document belongs to the book "Using R for Introductory Statistics" by John Verzani.
# copyright (c) 2004 - Taylor and Francis
#
# In the document, foundations of statistical knowledge is practiced on R.
#
# written by Erkam Minsin
#
# last modified: Dec 2022
# first written: Dec 2022
setwd("C:/Users/erkam/R Learning/Using_R_for_Introductory_Statistics_John_Verzani")
## install.packages("UsingR")
library(UsingR)
## update.packages()
# Chapter 7 Confidence Intervals ---------------------------------------------
# in this chapter, we use probability models to make statistical inferences about the parent dist of a sample
# 7.1 Confidence interval ideas
# 7.1.1 Finding confidence intervals using simulation
pop <- rep(0:1, c(10000-5600, 5600))
phat <- mean(sample(pop, 100)) # without replacement
phat # the estimated proportion of 1 ("for" votes) in 100 samples selected from the population
res <- c()
for(i in 1:1000) res[i] = mean(sample(pop, 100)) # simulation of phat
quantile(res, c(0.1,0.9)) # 80% of the time; P(0.50 <= phat <= 0.63) = 0.80
quantile(res, c(0.05,0.95)) # 90% of the time; P(0.48 <= phat <= 0.64) = 0.90
quantile(res, c(0.025,0.975)) # 95% of the time; P(0.47 <= phat <= 0.66) = 0.95
# we interpret these by envisioning picking one of the 1000 samples at random and asking the probability that it is in that range
# this should be close to the true probability (p=0.56) that phat is in the range, as we have generated many realizations of phat
# let us rewrite
# if we specify a probability,
# then we can find an interval around a randomly chosen sample value, phat, that contains p with the specified probability
# this interval is called a confidence interval,
# as we have a certain confidence (given by the probability) that the parameter, p, is in this random interval
# 7.2 Confidence intervals for a population proportion, p
zstar <- -qnorm(alpha/2) # left tail
zstar <- qnorm(1-alpha/2) # right tail
alpha <- 2*pnorm(-zstar) # inverse relationship
# example
n = 1013
phat <- 466/n
SE <- sqrt(phat*(1-phat)/n)
alpha = 0.05
zstar <- -qnorm(alpha/2)
zstar # nearly 2 if doing by hand
c(phat-zstar*SE, phat+zstar*SE) # (0.4293,0.4907) ; the CI does not include 0.5
phat + c(-1,1)*zstar*SE # less typing for CI
# example : missing CI
# phat=0.57, n=1000, margin of error=0.03 what is alpha?
zstar <- 0.03/sqrt(0.57*(1-0.57)/1000)
zstar
alpha <- 2*pnorm(-zstar)
alpha
1-alpha # there is an implied 95 CI
# 7.2.1 Using prop.test() to find confidence intervals
# prop.test(x, n, conf.level=0.95, conf.int=TRUE)
# x : frequency ; n : sample size ; conf.level = 0.95 by default
prop.test(466, 1013, conf.level = 0.95)
# binom.test() will also find CI. In this case, it uses the binomial dist in place of the normal approximation
# Problems 7.2.2
# Question 7.1
# Question 7.2
# not everyone in the USA has a listed phone number
# Question 7.3
# because first not every Internet user choose to participate
# second not everyone in the population is an Internet user
# Question 7.4
# I would believe more in CNN's poll as sample size is way larger and may include all Zogby poll participants
# Question 7.5
n = 100 ; phat = 0.45
SE <- sqrt(phat*(1-phat)/n)
zstar <- -qnorm(0.20/2) # with 80% confidence ; alpha = 0.20
phat + c(-1,1)*zstar*SE
zstar <- -qnorm(0.10/2) # 90% confidence ; alpha = 0.10
phat + c(-1,1)*zstar*SE
prop.test(n*phat, n, conf.level = 0.80)
prop.test(n*phat, n, conf.level = 0.90)
# Question 7.6
n = 100 ; x = 5 ; p = 0.10
prop.test(x, n, conf.level = 0.95) # CI -> 0.01855256 - 0.11829946 ; YES, population parameter p = 0.10 is in the CI
# Question 7.7
n = 10 ; x = 9
prop.test(x, n, conf.level = 0.80) # CI -> 0.6577132 - 0.9901076
# Question 7.8
alpha = 0.05 ; SE*zstar = 0.02 ; phat = 0.54
zstar <- -qnorm(alpha/2)
zstar
n <- phat*(1-phat)*(zstar^2)/((0.02)^2) # n = 787
# Question 7.9
n = 5 ; x = 4 # thus phat = 0.80
prop.test(x, n, conf.level = 0.90) # CI -> 0.3493025 - 0.9861052
n = 100 ; phat = 0.80
prop.test(n*phat, n, conf.level = 0.90) # CI -> 0.7212471 - 0.8617706
n = 1000 ; phat = 0.80
prop.test(n*phat, n, conf.level = 0.90) # CI -> 0.7778789 - 0.8204633
# Question 7.10
n = 250 ; phat = 0.45
prop.test(n*phat, n, conf.level = 0.95) # CI -> 0.3876067 - 0.5139554
ME <- phat - 0.3876067 # or ME <- 0.5139554 - phat ; roughly 0.062
n = 1000 ; phat = 0.45
prop.test(n*phat, n, conf.level = 0.95) # CI -> 0.4189204 - 0.4814685
ME <- phat - 0.4189204 # or ME <- 0.4814685 - phat ; roughly 0.031
# CI for n=1000 is less than for n=250 as n increases, the CI gets less
# ME for n=1000 is half of ME for n=250 as margin of error is inversely proportional with sqrt(n)
# Question 7.11
# no matter what phat is phat(1-phat) is largest when phat = 1/2
# so we have n large enough if
alpha = 0.05
zstar <- -qnorm(alpha/2)
(zstar/0.01)^2/4 # n should be 9604
alpha = 0.20
zstar <- -qnorm(alpha/2)
(zstar/0.01)^2/4 # n should be 4106
# Question 7.12
m = 50; n = 20 ; p = 0.5
alpha = 0.50
zstar <- qnorm(1-alpha/2)
phat <- rbinom(m, n, p)/n # divide by n for proportions
SE <- sqrt(phat*(1-phat)/n)
sum(phat-zstar*SE<p & p<phat+zstar*SE)/m
matplot(rbind(phat - zstar*SE , phat + zstar*SE),
rbind(1:m , 1:m), type = "l", lty = 1)
abline(v = p) # indicate parameter value
# 7.3 Confidence intervals for the population mean, mu
# the success of finding a CI for p in terms of phat depended on knowing the sampling dist of phat once we standardized it
# we can use the same approach to find a CI for mu, the population mean, from the sample mean Xbar
# if sigma IS known (population data IS known) and n IS large enough
# the central limit theorem : Z = Xbar-mu / sigma/sqrt(n), z-scores for each Xbar (Z), will have an approximately normal dist
# result: mu is in the interval Xbar +/- 2*sigma/sqrt(n) with probability 0.95
# if sigma IS NOT known (population data IS NOT known) and n IS still large enough
# replace the unknown population sd, sigma, by the known sample sd, s
# the central limit theorem still applies : T = Xbar-mu / s/sqrt(n) , T has a sampling dist approximately normal
# result: mu is in the interval Xbar +/- 1.96*s/sqrt(n) with probability 0.95
# if sigma MAY BE known (population data MAY BE known) but n IS NOT large enough
# In this case, sampling dist of T is the t-distribution with n-1 df
# t-distribution is a symmetric, bell-shaped dist that asymptotically approaches the standard normal dist but for small n has fatter tails
# the df, n-1, is a parameter for this dist the way the mean and sd are for the normal dist
# t.test(x, conf.level=0.95) will compute the confidence intervals
# x : data stored in data vector x
tstar <- qt(1-alpha/2, df = n-1)
alpha <- 2*pt(-tstar, df = n-1)
zstar <- qnorm(1-alpha/2)
alpha <- 2*pnorm(-zstar)
# example
Xbar = 66; s = 4; n = 30
alpha = 0.20
tstar <- qt(1-alpha/2, df = n-1)
SE <- s/sqrt(n)
c(Xbar - tstar*SE, Xbar + tstar*SE)
# example
ozs <- c(1.95,1.80,2.10,1.82,1.75,2.01,1.83,1.90)
qqnorm(ozs) # approximately linear
t.test(ozs, conf.level = 0.80) # CI -> 1.836 - 1.954
# the T-statistic robust :
# sampling dist of T is the t-dist with n-1 df. This is true when the Xi are i.i.d. normal
# what if the Xi are not normal?
# if n is small, we can do simulations to see that the dist of T still approximately the t-dist if the parent dist of the Xi is not too far from normal
# that is, the tails can't be too long, or the skew can't be too great
# when n is large, the central limit theorem applies
# a statistic whose sampling distribution doesn't change dramatically for moderate changes in the population dist is called a robust statistic
# 7.3.1 One-sided confidence intervals
# prop.test(), binom.test(), t.test() can return one-sided confidence intervals
# when argument alt="less" is used, an interval of the type (-Inf,b] is printed
# similarly, when alt="greater" is used, an interval of the type [b,Inf) is printed
# example
x <- c(175,185,170,184,175)
t.test(x, conf.level = 0.90, alt = "less") # 90 percent confidence interval: -Inf - 182.2278
# Problems 7.3.2
# Question 7.13
Xbar = 9.5 ; s = 1 ; n = 15
alpha = 0.1
tstar <- qt(1 - alpha/2, df = n-1)
SE <- s/sqrt(n)
c(Xbar - tstar*SE, Xbar + tstar*SE) # CI for n=15 -> 9.045 - 9.955 ; 10 days is not in this interval
# Question 7.14
with(stud.recs, # data set is stud.recs, data vector x is sat.m
t.test(sat.m, conf.level = 0.90)) # mean of x : 485.9375 ; CI for df=159 -> 476.8953 - 494.9797
# Question 7.15
with(homedata, # data set is homedata, data variable is y1970
t.test(y1970, conf.level = 0.90)) # mean of y1970 : 70820.93 ; CI for df=6840 -> 70377.72 - 71264.14
with(homedata, # data set is homedata, data variable is y2000
t.test(y2000, conf.level = 0.90)) # mean of y2000 : 268369.8 ; CI for df=6840 -> 265769.5 - 270970.0
# Question 7.16
# 5-year-olds = 60-72-months-old
ind <- kid.weights$age < 72 & kid.weights$age >= 60
t.test(kid.weights$weight[ind], conf.level = 0.90)
# Question 7.17
hist(brightness)
t.test(brightness, conf.level = 0.90) # mean of brightness : 8.417743 ; CI for df=965 -> 8.349184 - 8.486303
# Question 7.18
with(normtemp, hist(temperature))
t.test(normtemp$temperature, conf.level = 0.90) # mean of temperature : 98.24923 ; CI for df=129 -> 98.14269 - 98.35577. It doesn't include 98.6 F
# Question 7.19
Xbar <- 67.5 ; s = 2.54 ; n = 4
alpha = 0.05
tstar <- qt(1-alpha/2, df = n-1)
SE <- s/sqrt(n)
c(Xbar - tstar*SE, Xbar + tstar*SE) # CI for df=3 -> 63.45829 - 71.54171
# Question 7.20
n = 10 ; m = 250 ; df = n-1
res <- c()
for(i in 1:m) {
x = rnorm(n, mean=0, sd=1) # change this line only
res[i] = (mean(x)-0)/(sd(x)/sqrt(n))
}
qqplot(res, rt(m, df=df)) # compare the dist of the sample with a sample from the t-distribution
# close to linear, which means that sampling distribution is the t-distribution
n = 10 ; m = 250 ; df = n-1
res <- c()
for(i in 1:m) {
x = runif(n)-1/2
res[i] = (mean(x)-0)/(sd(x)/sqrt(n))
}
qqplot(res, rt(m, df=df)) # short-tailed
n = 10 ; m = 250 ; df = n-1
res <- c()
for(i in 1:m) {
x = rt(n,3)
res[i] = (mean(x)-0)/(sd(x)/sqrt(n))
}
qqplot(res, rt(m, df=df)) # symmetric, long-tailed
n = 10 ; m = 250 ; df = n-1
res <- c()
for(i in 1:m) {
x = rt(n,30)
res[i] = (mean(x)-0)/(sd(x)/sqrt(n))
}
qqplot(res, rt(m, df=df)) # symmetric, not so long-tailed
n = 10 ; m = 250 ; df = n-1
res <- c()
for(i in 1:m) {
x = rexp(n)-1
res[i] = (mean(x)-0)/(sd(x)/sqrt(n))
}
qqplot(res, rt(m, df=df)) # skewed
# Question 7.21
n = 3 # for n=3
boxplot(rt(1000, df = n-1), rnorm(1000)) # compare the tail of t-distribution with normal distribution
x <- seq(0,1, length = 150)
plot(qt(x, df = n-1), qnorm(x)) # compare the quantiles of t-distribution with normal distribution
abline(0,1)
curve(dnorm(x), -3.5, 3.5)
curve(dt(x, df = n-1), lty = 2, add = TRUE) # compare the densities of t-distribution with normal distribution
n = 10 # for n=10
boxplot(rt(1000, df = n-1), rnorm(1000))
x <- seq(0,1, length = 150)
plot(qt(x, df = n-1), qnorm(x))
abline(0,1)
curve(dnorm(x), -3.5, 3.5)
curve(dt(x, df = n-1), lty = 2, add = TRUE)
n = 25 # for n=25 ; seems large enough to say the two dists are essentially the same
boxplot(rt(1000, df = n-1), rnorm(1000))
x <- seq(0,1, length = 150)
plot(qt(x, df = n-1), qnorm(x))
abline(0,1)
curve(dnorm(x), -3.5, 3.5)
curve(dt(x, df = n-1), lty = 2, add = TRUE)
n = 50 # for n=50
boxplot(rt(1000, df = n-1), rnorm(1000))
x <- seq(0,1, length = 150)
plot(qt(x, df = n-1), qnorm(x))
abline(0,1)
curve(dnorm(x), -3.5, 3.5)
curve(dt(x, df = n-1), lty = 2, add = TRUE)
n = 100 # for n=100
boxplot(rt(1000, df = n-1), rnorm(1000))
x <- seq(0,1, length = 150)
plot(qt(x, df = n-1), qnorm(x))
abline(0,1)
curve(dnorm(x), -3.5, 3.5)
curve(dt(x, df = n-1), lty = 2, add = TRUE)
# Question 7.22
m = 200 ; n = 10 ; sigma = 2 ; mu = 0 ; alpha = 0.10
SD <- c() # confidence intervals using the dist of Z (population, sigma)
SE <- c() # confidence intervals using the dist of T (sample, s)
for (i in 1:m) {
X <- rnorm(n, mean=mu, sd=sigma)
SD[i] <- qnorm(1-alpha/2)*sigma/sqrt(n)
SE[i] <- qt(1-alpha/2, df = n-sigma)*sd(X)/sqrt(n)
}
100 * (sum(SE > SD) / m) # the percentage of the time the CI using SD(Xbar) was smaller than usind SE(Xbar)
# 7.4 Other confidence intervals
# to form confidence intervals, we have used the key fact that certain statistics,
# (phat-p) / sqrt(phat(1-phat)/n) and (Xbar-mu) / s/sqrt(n) ,
# have known sampling distributions that do not involve any population parameters
# from this, we could then solve for confidence intervals for the parameter in terms of known quantities
# in general, such a statistic is called a pivotal quantity and can be used to generate number of confidence intervals in various situations
# 7.4.1 Confidence interval for sigma^2
# for example, if the Xi are i.i.d. normal, then the distribution of (n-1)*s^2 / sigma^2 is known to be the chi-squared distribution with n-1 df
# this allows us to solve for confidence intervals for sigma^2 in terms of the sample variance s^2
# for a given alpha : P(lstar <= (n-1)*s^2 / sigma^2 <= rstar) = 1-alpha
# given that lstar and rstar yield equal areas in the tails,
# the interval ((n-1)*s^2/rstar, (n-)*s^2/lstar) gives a (1-alpha)100% confidence interval for sigma^2
# example
s2 = 12 ; n = 10 ; alpha = 0.05
lstar <- qchisq(alpha/2, df = n-1)
rstar <- qchisq(1-alpha/2, df = n-1)
(n-1)*s2*c(1/rstar,1/lstar) # CI for sigma^2
sqrt((n-1)*s2*c(1/rstar,1/lstar)) # CI for sigma
# Problems 7.4.2
# Question 7.23
# let X1,X2,..., Xn and Y1,Y2,...,Ym be two i.i.d. samples with sample variances s_x and s_y
# a CI for the equivalence of sample variances can be given from the F-statistic, (s_x^2/sigma_x^2) / (s_y^2/sigma_y^2)
# if the underlying Xi and Yi are normally distributed,
# then the distribution of F is known to be the F-distribution with n-1 and m-1 degrees of freedom
# that is, F is a pivotal quantity, so probability statements,
# such as P (a <= (s_x^2/sigma_x^2) / (s_y^2/sigma_y^2) <= b) can be answered with the known quantiles of the F distribution
# example
n = 11 ; m = 16 ; alpha = 0.10
qf(c(alpha/2 , 1-alpha/2), df1 = n-1, df2 = m-1) # says that P(0.3515 <= (s_x^2/sigma_x^2) / (s_y^2/sigma_y^2) <= 2.5437) = 0.90 when n=11, m=16
n = 10 ; m = 20 ; alpha = 0.20
s_x = 2.3 ; s_y = 2.8
qf(c(alpha/2 , 1-alpha/2), df1 = n-1, df2 = m-1) # P(0.434 <= (s_x^2/sigma_x^2) / (s_y^2/sigma_y^2) <= 1.984) = 0.80
upperbound <- sqrt(1/0.434)*(s_x/s_y)
lowerbound <- sqrt(1/1.984)*(s_x/s_y) # P(0.583 < sigma_x / sigma_y < 1.247) = 0.80
# Question 7.24
# assume our data, X1, X2,..., Xn is uniform on the interval [0,teta] (teta is an unknown parameter)
# set max(X) to the be maximum value in the data set
# then the quantity max(X)/teta is pivotal with distribution P(max(X)/teta < x) = x^n , 0 <= x <= 1
# thus P(max(X)/x < teta) = x^n
# as teta is always bigger than max(X), we can solve for x^n=alpha and get that teta is in the interval [max(X),max(X)/x] with probability 1-alpha
attach(nym.2002)
n <- length(place)
alpha = 0.10
(1-alpha)^(1/n)
max(place)/(1-alpha)^(1/n) # 23664
max(place) # 23662
detach(nym.2002)
# so it is 90% certain that teta is between 23662 and 23664 (the real answer is 23664)
# 7.5 Confidence intervals for differences
# 7.5.1 Difference of proportions
# to see if a difference in the proportions is explainable by sampling error,
# we look at phat1-phat2 and find a confidence interval for p1-p2
# this can be done, as the statistic Z = (phat1-phat2)-(p1-p2) / SE(phat1-phat2) is a pivotal quantity
# with standard normal distribution when n1 and n2 are large enough
# the SE is SE(phat1-phat2) = sqrt(phat1(1-phat1)/n1 + phat2(1-phat2)/n2)
# Z has an asymptotic normal distribution
# prop.test() can do the calculations
# example
prop.test(x = c(560,579), n = c(1000,1200), conf.level = 0.95)
# prop1=0.56, prop2=0.4825 ; CI at 95% confidence level for n=1000 and n=1200 -> 0.03479986 - 0.12020014
# the CI misses including 0 (the mean of the standard normal dist, estimated dist of Z)
# thus, we conclude that there appears to be a real difference in the population parameters
# 7.5.2 Difference of means
# to see whether independent samples come from identical parent populations
# assume the populations for each sample are normally distributed
# the sampling dist of Xbar-Ybar is asymptotically normal as each is asymptotically normal
# consequently, T = (Xbar-Ybar)-(Mu_x-Mu_y) / SE(Xbar-Ybar), pivotal, will have an approximately normal dist, with mean=0 and var=1 fro large n_x and n_y
# for small n_x and n_y , T will have the t-distribution
# the SE(Xbar-Ybar) and degrees of freedom are computed differently depending on whether or not the population variances are equal
# t.test(x, y, var.equal=FALSE, conf.level=0.95) computes the CI of the difference of means of two samples to see if they come from the same population
# example
x <- c(0,0,0,2,4,5,13,14,14,14,15,17,17)
y <- c(0,6,7,8,11,13,16,16,16,17,18)
boxplot(list(drug=x, placebo=y), col = "gray") # compare spreads ; the medians are the same; the spreads are different; the means to be checked
t.test(x, y, var.equal = TRUE, conf.level = 0.95)
# results of equal variances assumption: t=-1.0542, df=22, p-value=0.3032 ; CI -> -8.279119 - 2.698699 ; mean of x=8.846154, mean of y=11.636364
t.test(x, y, var.equal = FALSE, conf.level = 0.95)
# results of different variances assumption: t=-1.0722, df=21.99, p-value=0.2953 ; CI -> -8.187298 - 2.606878 ; mean of x=8.846154, mean of y=11.636364
# overall results : the dfs are essentially identical for both cases (equal and unequal variances)
# the difference of 0 is still in the 95% CIs, even though the sample means differ quite a bit at first glance (8.846 vs 11.636), which means
# the difference in means may be explained by sampling error
# 7.5.3 Matched samples
# used for independent samples (paired or matched samples)
# T statistic, T = (Xbar-Ybar)-(Mu_x-Mu_y) / SE(Xbar-Ybar) is the pivotal quantity with a t-distribution
# what is standard error here as the samples are not independent?
# it is just the SE for the single sample Xi and Yi
# t.test() can compute the CI. If x and y store the data t.test(x, y, paired=TRUE, conf.level=0.95) or shortly : t.test(x-y)
# example
library(MASS)
names(shoes)
with(shoes, t.test(A-B, conf.level = 0.90)) # or
with(shoes, t.test(A, B, paired = TRUE, conf.level = 0.90)) # it does't include 0, indicating that there may be a difference in the means
# Problems 7.5.4
# Question 7.25
# assumption 1 : parent populations are normally distributed
# assumption 2 : sample sizes (seven) are large enough
x <- c(3.1,3.3,1.7,1.2,0.7,2.3,2.9)
y <- c(1.8,2.3,2.2,3.5,1.7,1.6,1.4)
boxplot(list(cocktail1=x,cocktail2=y), col="gray") # compare spreads
t.test(x, y, var.equal = FALSE, conf.level = 0.80)
# CI -> -0.5322402 - 0.7322402 , includes 0, there may not be a difference in the means ; the means : x=2.171429, y=2.071429
t.test(x, y, var.equal = TRUE, conf.level = 0.80)
# CI -> -0.5281055 - 0.7281055 , includes 0, there may not be a difference in the means ; the means : x=2.171429 , y=2.071429
# Question 7.26
# assumption : parent populations are normally distributed
x <- c(7,0,8,1,10,12,2,9,5,2)
y <- c(2,1,5,1,5,7,-1,8,7,3)
boxplot(list("400mg"=x, "1200mg"=y), col="gray") # compare spreads
t.test(x, y, var.equal = FALSE, conf.level = 0.90)
# CI -> -1.058279 - 4.658279 , includes 0, there may not be a difference in the means ; the means : x=5.6, y=3.8
t.test(x, y, var.equal = TRUE, conf.level = 0.90)
# CI -> -1.043488 - 4.643488 , includes 0, there may not be a difference in the means ; the means : x=5.6 , y=3.8
# Question 7.27
# this data is paired off, as it is reasonable to assume that there is a correlation between the IQ scores of twins
# assumption : differences in the scores can be viewed as a random sample from a normally distributed population
x <- c(80,88,75,113,95,82,97,94,132,108)
y <- c(90,91,79,97,97,82,87,94,131,115)
plot(x,y)
boxplot(x-y)
# the exploratory plots show the correlation between the two sets of scores and the distribution of the differences,
# indicating that a paired t-test is appropriate
t.test(x, y, paired = TRUE, conf.level = 0.90) # CI for the differences of mean -> -4.368937 - 4.568937, including 0, no difference is possible
# Question 7.28
attach(babies)
plot(age, dage) # seems they are correlated, thus the samples can be matched
boxplot(age, dage) # distribution of the variables
t.test(age, dage, paired = TRUE, conf.level = 0.95)
# CI for the differences of mean -> -3.745356 - -2.986035, not including 0, there should be a difference in the means
detach(babies)
# 7.6 Confidence intervals for the median
# used when the sample is an independent sample but not taken from a normal or close to normal population
# for these situations, nonparametric methods are preferred
# 7.6.1 Confidence intervals based on the binomial
# let T count the number of data points more than the median in a sample of size n
# T is a Binomial(n,0.5) random variable
# let X1,X2,...,Xn be the sample after sorting from smallest to largest
# find the largest j so that P(j <= T <= n-j) > 1-alpha , which in turn becomes a search for the largest j with P(T<j)<alpha/2
# example
x <- c(110,12,2.5,98,1017,540,54,4.3,150,432)
n <- length(x)
pbinom(0:n, n, 0.5) # P(T <= k) here, not P(T < k)
# [1] 0.0009765625 0.0107421875 0.0546875000 0.1718750000 0.3769531250
# [6] 0.6230468750 0.8281250000 0.9453125000 0.9892578125 0.9990234375
# [11] 1.0000000000
# for 90% CI, alpha/2 = 0.05, thus largest j that satisfies P(T < j) < 0.05 is 2
# so, 90% CI that fulfills P(Xj <= M <= X(n+1-j) >= 1-alpha is [4.3,540]
# shortly
j <- qbinom(0.05, n, 0.5)
sort(x)[c(j,n+1-j)] # 4.3 540.0
# 7.6.2 Confidence intervals based on signed-rank statistic
# the Wilcoxon signed-rank statistic allows for an improvement on the CI given by the number of data points above the median
# its usage is valid when the Xi are assumed to be symmetric about their median
# if this is so, then a data point is equally likely to be on the left and right of the median,
# and the distance from the median is independent of what side of the median the data point is on
# if we know the median then we can rank the data by distance to the median
# In R, it is available under the family name signrank
# qsignrank() : returns the quantiles
# the procedure is implemented in the wilcox.test()
# we need to specify that a CI is desired with the argument conf.int=TRUE
# example
x <- c(110,12,2.5,98,1017,540,54,4.3,150,432)
boxplot(list("CEO"=scale(x), "log.CEO"=scale(log(x)))) # scale() makes a data set have mean 0 and variance 1
title("Boxplot of CEO data and its logarithm")
wilcox.test(log(x), conf.int = TRUE, conf.level = 0.90) # CI -> 2.963463 - 5.539530 , (pseudo)median = 4.344732
exp(c(2.963463,5.539530)) # the new CI -> [19.37,254.56] from [4.3,540.0]
# 7.6.3 Confidence intervals based on the rank-sum statistic
# if the two distributions are the same up to a possible shift of center, then a confidence interval based on a nonparametric statistic can be given
# let f(x) be a density for a mean-zero distribution, and suppose we have two independent random samples:
# the first, X1,X2...,Xn_x from a population with density f(x-mu_x),
# and the second,Y1,Y2,...,Yn_y from a population with density f(x-mu_y)
# the basic statistic, called the rank-sum statistic,
# looks at all possible pairs of the data and counts the number of times the X value is greater than or equal to the Y value
# if the population mean for the X values is larger than the population mean for the Y values, this statistic will likely be large
# if the mean is smaller, then the statistic will likely be small
# the distribution of this statistic is given by R with the wilcox family
# and is used to give a confidence interval for the difference of the means
# the command wilcox.test (x, y, conf . int=TRUE). function will find a confidence interval for the difference in medians of the two data sets
# example
pay.00 <- c(110,12,2.5,98,1017,540,54,4.3,150,432)
pay.02 <- c(312,316,175,200,92,201,428,51,289,1126,822)
plot(density(pay.02), main = "densities of y2000,y2002")
lines(density(pay.00),lty=2)
# shows two data sets that are quite skewed, so confidence intervals based on the T statistic would be inappropriate
# rather, as the two data sets have a similar shape, we find the confidence interval returned by wilcox.test()
wilcox.test(pay.02, pay.00, conf.int = TRUE, conf.level = 0.9)
# the 90% confidence interval, [-18,282], contains 0 value of ()
# this example would be improved if we had matched or paired data—that is, the salaries for the same set of CEOs in the year 2000 and 2002 - as then
# differences in the sampling would be minimized
# if that case is appropriate, then adding the argument paired=TRUE to wilcox.test() computes a confidence interval based on the signed-rank statistic
# Problems 7.6.4
# Question 7.29
x <- c(21,21,21,22,22,23,23,24,24,24,24,24,24,25,25,26,29,31,31,33)
stem(x) # the data set implies that the population distribution is skewed and not normally distributed or symmetric
stem(log(x)) # thus t.test() and wilcox.test() are not appropriate. However, after a log transform, it appears that wilcox.test() will be
wilcox.test(log(x), conf.int = TRUE, conf.level = 0.90)
exp(c(3.134569,3.262468)) # 90% CI for the median -> 22.97873 - 26.11391 ; median: 24
# Question 7.30
attach(cabinet)
hist(est.tax.savings) # skewed, not normally distributed or symmetric
hist(log(est.tax.savings)) # more symmetric
wilcox.test(log(est.tax.savings), conf.int = TRUE, conf.level = 0.90)
exp(c(7.240808,9.736620))
# 90% CI for the median -> 1395.221 - 16926.234 ; median : 5133.485
detach(cabinet)
# Question 7.31
# we might envision the collection of songs on an album as coming from a population of all possible songs the band could have written at the given time
# a difference of means would indicate a tendency to write longer songs at one of the given times
wilcox.test(u2$October, u2$"The Joshua Tree", conf.int = TRUE, conf.level = 0.95)
# 95% CI for the difference in median between the albums October and "The Joshua Tree" is [-94 , -10]
# Question 7.32
hist(cfb$AGE) # symmetric
wilcox.test(cfb$AGE, conf.int = TRUE, conf.level = 0.95) # [48 , 50] ; median : 49
hist(cfb$INCOME) # no symmetry
hist(log(cfb$INCOME+1)) # symmetric
wilcox.test(log(cfb$INCOME+1), conf.int = TRUE, conf.level = 0.95)
exp(c(10.49,10.61)) - 1 # [35953 , 40537] ; median : 38127.6
# Question 7.33
# we can simulate the signed-rank distribution and see that it applies for any symmetric distribution regardless of tail length
n = 20 ; m = 250 # 250 samples
res = c() # the results
for(i in 1:m) {
x = rnorm(n)
res[i]=sum(rank(abs(x))[x>0]) # only add positive values
}
hist(res,prob=TRUE)
x = 40:140
lines(x,dsignrank(x,n)) # density-like, but discrete
res = c()
for(i in 1:m) {
x = rt(n, df = 2)
res[i]=sum(rank(abs(x))[x>0])
}
hist(res,prob=TRUE)
x = 40:140
lines(x,dsignrank(x,n))
res = c()
for(i in 1:m) {
x = runif(n, min = -1, max = 1)
res[i]=sum(rank(abs(x))[x>0])
}
hist(res,prob=TRUE)
x = 40:140
lines(x,dsignrank(x,n))
res = c()
for(i in 1:m) {
x = rexp(n, rate = 1) -1
res[i]=sum(rank(abs(x))[x>0])
}
hist(res,prob=TRUE)
x = 40:140
lines(x,dsignrank(x,n))