【白板推导系列笔记】数学基础-概率-高斯分布-极大似然估计&极大似然估计-有偏VS无偏

121 阅读1分钟

本文已参与「新人创作礼」活动,一起开启掘金创作之路。

关于最大似然估计法,我们有以下直观想法:现在已经取到样本值 $x_{1},x_{2},\cdots,x_{n}$ 了,这表明取到这一样本值的概率 $L(\theta)$ 比较大。我们当然不会考虑那些不能使样本 $x_{1},x_{2},\cdots,x_{n}$ 出现的 $\theta \in \Theta$ 作为 $\theta$ 的估计。再者,如果已知当 $\theta=\theta_{0}\in \Theta$ 时使 $L(\theta)$ 取很大值,而 $\Theta$ 中的其他值使 $L(\theta)$ 取很小值,我们自然认为取 $\theta_{0}$ 作为未知参数 $\theta$ 的估计值较为合理。

来源:《概率论与数理统计》高等教育出版社-P152

 

$$
\begin{gathered}
\text{Data}:X=(x_{1},x_{2},\cdots,x_{N})^{T}=\begin{pmatrix} x_{1}^{T} \\ x_{2}^{T} \\ \vdots \\ x_{N}^{T} \end{pmatrix}_{N \times p},\quad x_{i} \in \mathbb{R}^{p},\quad x_{i}\overset{\text{iid}}{\sim }N(\mu,\Sigma )\\
\text{MLE}:\theta_{\text{MLE}}=\mathop{\arg\max}\limits_{\theta}P(X|\theta),\quad \theta=(\mu,\Sigma )
\end{gathered}
$$

令 $p=1,\theta=(\mu,\sigma^{2})$,则有

$$
\begin{aligned}
p(x)&=\frac{1}{\sqrt{2\pi}\sigma}\exp\left(-\frac{(x-\mu)^{2}}{2\sigma^{2}}\right)\\
p(x)&=\frac{1}{(2\pi)^{\frac{p}{2}}\left|\Sigma \right|^{\frac{1}{2}}}\exp\left(- \frac{1}{2}(x-\mu )^{T}\Sigma ^{-1}(x-\mu)\right)
\end{aligned}
$$

这里先讨论一维的情况

$$
\begin{aligned}
\log P(X|\theta)&=\log \prod\limits_{i=1}^{N}p(x_{i}|\theta)\\
&=\sum\limits_{i=1}^{N}\log p(x_{i}|\theta)\\
&=\sum\limits_{i=1}^{N}\log \frac{1}{\sqrt{2\pi}\sigma}\exp\left(- \frac{(x_{i}-\mu)^{2}}{2\sigma^{2}}\right)\\
&=\sum\limits_{i=1}^{N}\left[\log \frac{1}{\sqrt{2\pi}}+\log \frac{1}{\sigma}- \frac{(x_{i}-\mu)^{2}}{2\sigma^{2}}\right]
\end{aligned}
$$

对于 $\mu_\text{MLE}$:

$$
\begin{aligned}
\mu_\text{MLE}&=\mathop{\arg\max}\limits_{\mu}\log P(X|\theta)\\
&=\mathop{\arg\max}\limits_{\mu}\sum\limits_{i=1}^{N}- \frac{(x_{i}-\mu)^{2}}{2\sigma^{2}}\\
&=\mathop{\arg\min}\limits_{\mu}\sum\limits_{i=1}^{N}(x_{i}-\mu)^{2}\\
\frac{\partial }{\partial \mu}\sum\limits_{i=1}^{N}(x_{i}-\mu)^{2}&=\sum\limits_{i=1}^{N}2(x_{i}-\mu)(-1)\\
\sum\limits_{i=1}^{N}2(x_{i}-\mu)(-1)&=0\\
\sum\limits_{i=1}^{N}(x_{i}-\mu)&=0\\
\mu_\text{MLE} &=\frac{1}{N}\sum\limits_{i=1}^{N}x_{i}
\end{aligned}
$$

对于 $\sigma^{2}_\text{MLE}$:

$$
\begin{aligned}
\sigma^{2}_\text{MLE}&=\mathop{\arg\max}\limits_{\sigma}\log P(X|\theta)\\
&=\mathop{\arg\max}\limits_{\sigma}\sum\limits_{i=1}^{N}\left(- \log \sigma- \frac{1}{2\sigma^{2}}(x_{i}-\mu)^{2}\right)\\
\frac{\partial }{\partial \sigma}\sum\limits_{i=1}^{N}\left(- \log \sigma- \frac{1}{2\sigma^{2}}(x_{i}-\mu)^{2}\right)&=\sum\limits_{i=1}^{N}\left[- \frac{1}{\sigma}+(x_{i}-\mu)^{2}\sigma^{-3}\right]\\
\sum\limits_{i=1}^{N}\left[- \frac{1}{\sigma}+(x_{i}-\mu)^{2}\sigma^{-3}\right]&=0\\
-\sum\limits_{i=1}^{N}\sigma^{2}+\sum\limits_{i=1}^{N}(x_{i}-\mu)^{2}&=0\\
\sum\limits_{i=1}^{N}\sigma^{2}&=\sum\limits_{i=1}^{N}(x_{i}-\mu)^{2}\\
\sigma^{2}_\text{MLE}&=\frac{1}{N}\sum\limits_{i=1}^{N}(x_{i}-\mu_\text{MLE})^{2}
\end{aligned}
$$

实际上,$\mu_\text{MLE}$ 是无偏估计,$\sigma^{2}_\text{MLE}$ 是有偏估计。

对于 $\mu_\text{MLE}$:

$$
\begin{aligned}
E(\mu_\text{MLE})&=\frac{1}{N}\sum\limits_{i=1}^{N}E(x_{i})\\
&=\frac{1}{N}\sum\limits_{i=1}^{N}\mu\\
&=\mu
\end{aligned}
$$

对于 $\sigma^{2}_\text{MLE}$:

$$
\begin{aligned}
\sigma_\text{MLE}^{2}&=\frac{1}{N}\sum\limits_{i=1}^{N}(x_{i}-\mu_\text{MLE})^{2}\\
&=\frac{1}{N}\sum\limits_{i=1}^{N}(x_{i}^{2}-2x_{i}\mu_\text{MLE}+\mu_\text{MLE}^{2})\\
&=\frac{1}{N}\sum\limits_{i=1}^{N}x_{i}^{2}-2\mu_\text{MLE}^{2}+\mu_\text{MLE}^{2}\\
&=\frac{1}{N}\sum\limits_{i=1}^{N}x_{i}^{2}-\mu_\text{MLE}^{2}\\
E(\sigma_\text{MLE}^{2})&=E\left(\frac{1}{N}\sum\limits_{i=1}^{N}x_{i}^{2}-\mu_\text{MLE}^{2}\right)\\
&=E\left[ \left(\frac{1}{N}\sum\limits_{i=1}^{N}x_{i}^{2}-\mu^{2}\right)-(\mu_\text{MLE}^{2}-\mu^{2})\right]\\
&=E\left(\frac{1}{N}\sum\limits_{i=1}^{N}x_{i}^{2}-\mu^{2}\right)-E(\mu_\text{MLE}^{2}-\mu^{2})\\
&=\frac{1}{N}\sum\limits_{i=1}^{N}E(x_{i}^{2}-\mu^{2})-[E(\mu_\text{MLE}^{2})-E(\mu^{2})]\\
&=\frac{1}{N}\sum\limits_{i=1}^{N}[E(x_{i}^{2})-E(\mu^{2})]-[E(\mu_\text{MLE}^{2})-E(\mu^{2})]\\
&=\frac{1}{N}\sum\limits_{i=1}^{N}[E(x_{i}^{2})-\mu^{2}]-[E(\mu_\text{MLE}^{2})-\mu^{2}]\\
&=\frac{1}{N}\sum\limits_{i=1}^{N}\left[E(x_{i}^{2})-E(x_{i})^{2}\right]-\left[E(\mu_\text{MLE}^{2})-E(\mu_\text{MLE})^{2}\right]\\
&=\frac{1}{N}\sum\limits_{i=1}^{N}\text{Var}(x_{i})-\text{Var}(\mu_\text{MLE})\\
&=\frac{1}{N}\sum\limits_{i=1}^{N}\sigma^{2}- \frac{\sigma^{2}}{N}\\
&=\frac{N-1}{N}\sigma^{2}
\end{aligned}
$$