应用回归分析上机

R 程序常用函数

  • anova(object, …) # 计算方差分析表, 返回模型的方差分析表, 其中的 object 是由 lm 或者 glm 得到的对象.
  • coefficients(object, …) # 提取模型系数, 返回模型的系数, 其中 object 是由模型构成的对象, 简写 coef().
  • deviance(object, …) # 计算残差平方和, 返回模型的残差平方和, 其中 object 是由模型构成的对象.
  • formula(object, …) # 提取模型公式, 返回模型公式, 其中 object 是由模型构成的对象.
  • plot(object, …) # 不多说了, 画图工具
  • predict(object, newdata=data.frame) # 作预测, 返回预测值和预测区间, 其中 object 是由 lm 构成的对象, newdata 是预测点的数据, 由数据框形式输入.
  • print(object, …) # 返回模型拟合的结果, 其中 object 是由模型构成的对象, 一般不用, 而是直接输入对象名称来显示.
  • residuals(object, type=c("working", "response", "deviance", "pearson", "partial")) # 计算残差, 返回模型的残差, 其中 object 是由 lm 或 aov 构成的对象, type 是返回值的类型.
  • step(object, …) # 作逐步回归, 返回逐步回归, 根据 AIC(Akaike’s An Information Criterion)的最小值选择模型, 其中 object 是由 lm 或 glm 构成的对象.
  • summary(object, …) # 提取模型资料, 返回较为详细的模型拟合结果, 其中 object 是由 lm 构成的对象.

课后题答案:http://doc.mbalib.com/view/a1daa9f2038303b4345688378f95b203.html

一元线性回归

# Problem 1: simulate y = 10 + 4x + eps with eps ~ N(0, 2) and plot
x <- c(3.4, 1.8, 4.6, 2.3, 3.1, 5.5, 0.7, 3.0, 2.6, 4.3, 2.1, 1.1, 6.1, 4.8, 3.8)
x <- sort(x)
epsilon <- rnorm(15, mean = 0, sd = 2)
y_i <- 10 + 4 * x + epsilon
y_i
plot(x, y_i, type = "l")
# Problem 2: fit the simple linear regression and print the estimates
para <- lm(y_i ~ x)    # intercept is included by default
print(para$coef[[1]])  # estimated intercept (true value 10)
print(para$coef[[2]])  # estimated slope (true value 4)
# Problem 3: repeat the simulation 10 times with a larger noise sd = 4
counter <- 0           # FIX: counter was incremented without being initialized
for (i in 1:10) {
  counter <- counter + 1
  epsilon <- rnorm(15, mean = 0, sd = 4)
  y_i <- 10 + 4 * x + epsilon
  para <- lm(y_i ~ x)
  print(para$coef[[1]])
  print(para$coef[[2]])
}

2.14

# Q1: scatter plot of the data
x <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
y <- c(3.5, 1, 4, 2, 1, 3, 4.5, 1.5, 3, 5)
plot(x, y)

# Q2, Q3: roughly linear; fit the simple linear regression
model <- lm(y ~ x)   # intercept included by default
result <- summary(model)
plot(model)          # diagnostic plots
coefficients(model)  # extract fitted coefficients

# Q4: residual standard error
result
# or computed by hand (n - 2 = 8 degrees of freedom)
sqrt(sum(residuals(model)^2) / (10 - 2))

# Q5, Q6: Multiple R-squared
result

# Q7: F test — reject H0 (no linear relation) when the p-value of the
# F statistic is below the significance level
result
anova(model)

# Q8: beta_1 is significant when its p-value is below alpha = 0.05
# (original note quoted p = 0.0354 -- TODO confirm against this data)
result

# Q9: FIX — cor.test expects the two data vectors, not the summary object
cor.test(x, y)

# Q10: residual plot and normality check
plot(residuals(model))
shapiro.test(residuals(model))  # Shapiro-Wilk normality test

2.15

# Q1: scatter plot of the data
x <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
y <- c(3.5, 1, 4, 2, 1, 3, 4.5, 1.5, 3, 5)
plot(x, y)

# Q2: roughly linear; fit the simple linear regression
para <- lm(y ~ x)   # intercept included by default
result <- summary(para)
plot(para)          # diagnostic plots
coefficients(para)  # extract fitted coefficients

# Q3, Q4, Q5: residual standard error
result

# Q6: Multiple R-squared
result

# Q7: F test — reject H0 when the p-value of the F statistic is below
# the significance level
result
anova(para)

# Q8: beta_1 is significant only when its p-value is below alpha = 0.05
# (the original note compared against alpha = 0.5, which is wrong)
result

# Q9: FIX — cor.test expects the two data vectors, not the summary object
cor.test(x, y)

# Q10: residual plot and normality check
plot(residuals(para))
shapiro.test(residuals(para))  # Shapiro-Wilk normality test

2.16

# Q1: load the data and inspect the scatter plot
x <- c(3356,3114,3554,4542,4669,4888,5710,5536,4168,3547,3159,3621,3782,4247,3982,3568,3155,3059,2967,3285,3914,4517,4349,5020,3594,2821,3366,2920,2980,3731,2853,2533,2729,2305,2642,3124,2752,3429,3947,2509,5440,4042,3402,2829,2297,2932,3705,4123,3608,8349,3766)
y <- c(19583,20263,20325,26800,29470,26610,30678,27170,25853,24500,24274,27170,30168,26525,27360,21690,21974,20816,18095,20939,22644,24624,27186,33990,23382,20627,22795,21570,22080,22250,20940,21800,22934,18443,19538,20460,21419,25160,22482,20969,27224,25892,22644,24640,22341,25610,26015,25788,29132,41480,25845)
plot(x, y)

# Q2: ordinary least-squares fit of the simple linear regression
para <- lm(y ~ x + 1)  # "+ 1" is the (implicit) intercept term
result <- summary(para)
plot(para)             # diagnostic plots for the fitted model
coefficients(para)     # fitted intercept and slope

(1)从某个已知正态分布产生随机数,得到其均值的区间估计(置
信水平去 0.95),将此过程重复 n=100 次,将所得 100 个区间作
图;
(2) n 取不同值,计算所得区间覆盖真值的频率 pn, 以 n 为横轴,
pn 为纵轴做曲线图,观察获得的趋势图。

# Draw 'sample_num' samples of size n from N(norm_mean, norm_sd^2) and
# build a 95% z confidence interval for the mean from each sample.
library(ggplot2)   # FIX: ggplot/geom_errorbar below were used without loading it

sample_num <- 100
n <- 20
norm_mean <- 9
norm_sd <- 2
alpha_half <- 1.96  # z quantile for a two-sided 95% interval

# FIX: the half-width is z * sigma / sqrt(n); the original multiplied by the
# *function* `sd` (a runtime error) and never used alpha_half at all.
# It is constant, so hoist it out of the loop.
delta <- alpha_half * norm_sd / sqrt(n)

# Preallocate instead of growing with rbind inside the loop
interval <- matrix(NA_real_, nrow = sample_num, ncol = 2)

# FIX: rep(1, sample_num) looped over 100 copies of the value 1
for (i in seq_len(sample_num)) {
  y <- rnorm(n, mean = norm_mean, sd = norm_sd)
  interval[i, ] <- mean(y) + c(-delta, delta)
}

df <- data.frame(id = seq(1, sample_num), lower = interval[, 1], upper = interval[, 2])
p <- ggplot(df, aes(lower, upper))
p + geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2)

4.9

x <- c(679,292,1012,493,582,1156,997,2189,1097,2078,1818,1700,747,2030,1643,414,354,1276,745,435,540,874,1543,1029,710,1434,837,1748,1381,1428,1255,1777,370,2316,1130,463,770,724,808,790,783,406,1242,658,1746,468,1114,413,1787,3560,1495,2221,1526)
y <- c(0.79,0.44,0.56,0.79,2.7,3.64,4.73,9.5,5.34,6.85,5.84,5.21,3.25,4.43,3.16,0.5,0.17,1.88,0.77,1.39,0.56,1.56,5.28,0.64,4,0.31,4.2,4.88,3.48,7.58,2.63,4.99,0.59,8.19,4.79,0.51,1.74,4.10,3.94,0.96,3.29,0.44,3.24,2.14,8.71,0.64,1.9,0.51,8.33,14.94,5.11,3.85,3.93)
# Ordinary least-squares regression
plot(x, y)
model <- lm(y ~ x)  # intercept included by default
# Fitted values at the observed x
predict_y <- predict(model, data.frame(x))
# Extract coefficients and draw the fitted line over the data range
w <- coefficients(model)
x_ <- seq(0, 3500, 10)
y_ <- w[2] * x_ + w[1]
lines(x_, y_)
# Residuals and residual plot
# FIX: residual = observed - fitted; the original used predict_y - y,
# which flips the sign of every residual in the plot
E <- y - predict_y
plot(E)
# Weighted least squares with weights 1/e^2 (sign of E is irrelevant here)
plot(x, y)
weighted_model <- lm(y ~ x, weights = 1 / E^2)
w <- coefficients(weighted_model)
x_ <- seq(0, 3500, 10)
y_ <- w[2] * x_ + w[1]
lines(x_, y_)

 

4.13

x <- c(127.3,130,132.7,129.4,135,137.1,141.1,142.8,145.5,145.3,148.3,146.4,150.2,153.1,157.3,160.7,164.2,165.6,168.7,172)
y <- c(20.96,21.4,21.96,21.52,22.39,22.76,23.48,23.66,24.1,24.01,24.54,24.28,25,25.64,26.46,26.98,27.52,27.78,28.24,28.78)

# Ordinary least-squares regression
plot(x, y)
model <- lm(y ~ x)  # intercept included by default

# Fitted values at the observed x
predict_y <- predict(model, data.frame(x))

# Extract coefficients and draw the fitted line
w <- coefficients(model)
x_ <- seq(0, 170, 2)
y_ <- w[2] * x_ + w[1]
lines(x_, y_)

# Residuals and residual plot
# FIX: residual = observed - fitted; the original used predict_y - y,
# which flips the sign of every residual in the plot
E <- y - predict_y
plot(E)
# Weighted least squares with weights 1/e^2 (sign of E is irrelevant here)
plot(x, y)
weighted_model <- lm(y ~ x, weights = 1 / E^2)
w <- coefficients(weighted_model)
# FIX: the range seq(0, 3500, 10) was copy-pasted from exercise 4.9;
# x here only spans about 127-172, so that line was drawn off-scale
x_ <- seq(0, 170, 2)
y_ <- w[2] * x_ + w[1]
lines(x_, y_)

5.9

参考: https://blog.csdn.net/li603060971/article/details/49508195

x_0 <- seq(1978, 1998, 1)  # years (not used as a regressor)
x_1 <- c(1018.4,1258.9,1359.4,1545.6,1761.6,1960.8,2295.5,2541.6,2763.9,3204.3,3831.0,4228.0,5017.0,5288.6,5800.0,6882.1,9457.2,11993.0,13844.2,14211.2,14599.6)
x_2 <- c(1607.0,1769.7,1996.5,2048.4,2162.3,2375.6,2789.0,3448.7,3967.0,4585.8,5777.2,6484.0,6858.0,8087.1,10284.5,14143.8,19359.6,24718.3,29082.6,32412.1,33429.8)
x_3 <- c(138.2,143.8,195.5,207.1,220.7,270.6,316.7,417.9,525.7,665.8,810.0,794.0,859.4,1015.1,1415.0,2284.7,3012.6,3819.6,4530.5,4810.6,5262.0)
x_4 <- c(96259,97542,98705,100072,101654,103008,104357,105851,107507,109300,111026,112704,114333,115823,117171,118517,119850,121121,122389,123626,124810)
x_5 <- c(2239.1,2619.4,2976.1,3309.1,3637.9,4020.5,4694.5,5773.0,6542.0,7451.2,9360.1,10556.5,11365.2,13145.9,15952.1,20182.1,26796.0,33635.0,40003.9,43579.4,46405.9)
x_6 <- c(50760,39370,44530,39790,33130,34710,31890,44370,47140,42090,50870,46990,38470,55470,51330,48830,55040,45821,46989,53429,50145)
y <- c(1132.3,1146.4,1159.9,1175.8,1212.3,1367.0,1642.9,2004.8,2122.0,2199.4,2357.2,2664.9,2937.1,3149.5,3483.4,4349.0,5218.1,6242.2,7408.0,8651.1,9876.0)

model <- lm(y ~ x_1 + x_2 + x_3 + x_4 + x_5 + x_6)  # full multiple regression
summary(model)
# Forward selection
# FIX: forward selection must start from the empty (null) model and add
# variables within the scope of the full model; calling
# step(model, direction = "forward") on the full model is a no-op.
null_model <- lm(y ~ 1)
forward <- step(null_model, scope = formula(model), direction = "forward")
summary(forward)
# Backward elimination starts from the full model
backward <- step(model, direction = "backward")
summary(backward)
# Stepwise regression (both directions)
both <- step(model, direction = "both")
summary(both)

# Variance inflation factors (requires the third-party 'car' package)
library(car)
vif(model)  # VIF > 10 suggests serious multicollinearity

9.2

R语言非线性回归教程:https://www.w3cschool.cn/r/r_nonlinear_least_square.html

x <- c(1000.0, 2000.0, 3000, 3500.0, 4000.0, 4500.0, 5000.0)
y <- c(5.2, 6.5, 6.8, 8.1, 10.2, 10.3, 13.0)
# Nonlinear least squares for y = a*x^2 + b; for a simple model like this
# nls converges from rough starting values.
start_vals <- list(a = 1, b = 1)
model <- nls(y ~ a * (x^2) + b, start = start_vals)
cor(y, predict(model))  # correlation of observed vs fitted as a fit measure
plot(x, y)              # visualize data and fitted curve
lines(x, predict(model), lty = 2, col = "red", lwd = 3)

9.3

x <- c(4.2, 4.06, 3.8, 3.6, 3.4, 3.2, 3, 2.8, 2.6, 2.4, 2.2, 2, 1.8, 1.6, 1.4)
y <- c(0.086, 0.09, 0.1, 0.12, 0.13, 0.15, 0.17, 0.19, 0.22, 0.24, 0.35, 0.44, 0.62, 0.94, 1.62)
plot(x, y)
# Additive error term: apply nonlinear least squares directly to
# y = a * exp(b / x); nls converges from rough starting values here.
model <- nls(y ~ a * exp(b / x), start = list(a = 1, b = 1))
cor(y, predict(model))  # observed-vs-fitted correlation as a fit measure
lines(x, predict(model), lty = 2, col = "red", lwd = 3)

# Multiplicative error term: take natural logs on both sides first,
# then fit the transformed model log(y) = a + b / x.
y <- log(y)
plot(x, y)
model <- nls(y ~ a + b / x, start = list(a = 1, b = 1))
cor(y, predict(model))  # fit measure on the log scale
lines(x, predict(model), lty = 2, col = "red", lwd = 3)

9.4

y <- c(7.5, 9.8, 11.4, 13.3, 17.2, 20.6, 29.1, 34.6, 47.4, 55.5, 59.6, 62.2, 66.5, 72.7, 77.2, 82.4, 85.4, 86.8, 87.2)
x <- seq(1, length(y), 1)
plot(x, y)
# Fit the logistic growth curve y = 1 / (1/u + a * b^x) by nonlinear
# least squares (u is the saturation level).
model <- nls(y ~ 1 / (1 / u + a * b^x), start = list(a = 1, b = 0.1, u = 100))
cor(y, predict(model))  # observed-vs-fitted correlation
lines(x, predict(model), lty = 2, col = "red", lwd = 3)

# Linearization: transform to log(1/y) and fit a straight line in x.
y <- log(1 / y)
plot(x, y)
model <- lm(y ~ x)
cor(y, predict(model))  # fit measure on the transformed scale
lines(x, predict(model), lty = 2, col = "red", lwd = 3)

9.5

y <- c(3624.1,3962.9,4124.2,4330.6,4623.1,5080.2,5977.3,6836.3,7305.4,7983.2,8385.9,8049.7,8564.3,9653.5,11179.9,12673,13786.9,14724.3,15782.8,16840.6,17861.6,18975.9,20604.7,22256.0,24247)
K <- c(1377.9,1446.7,1451.5,1408.1,1536.9,1716.4,2057.7,2582.2,2754,2884.3,3086.8,2907.5,2975.4,3356.8,4044.2,5487.9,5679,6012,6246.5,6436,6736.1,7098.9,7510.5,8567.3,9764.9)
L <- c(40152,41024,42361,43725,45295,46436,48197,49873,51282,52783,54334,55329,64749,65491,66152,66808,67455,68065,68950,69820,70637,71394,72085,73025,73740)
x <- seq(1, length(y), 1)
plot(x, y)
# Cobb-Douglas production function y = a * L^b * K^c fitted by
# nonlinear least squares (L = labor, K = capital).
model <- nls(y ~ a * L^b * K^c, start = list(a = 1, b = 0.5, c = 1))
cor(y, predict(model))  # observed-vs-fitted correlation
lines(x, predict(model), lty = 2, col = "red", lwd = 3)