详解线性分类-高斯判别分析(Gaussian Discriminant Analysis)-模型求解(求期望)&模型求解(求协方差)【白板推导系列笔记】

112 阅读1分钟

持续创作,加速成长!这是我参与「掘金日新计划 · 10 月更文挑战」的第4天,点击查看活动详情

$$
L(\mu_{1},\mu_{2},\Sigma,\phi)=\sum\limits_{i=1}^{N}[\underbrace{\log N(\mu_{1},\Sigma)^{y_{i}}}_{(1)}+\underbrace{\log N(\mu_{2},\Sigma)^{1-y_{i}}}_{(2)}+\underbrace{\log \phi^{y_{i}}(1-\phi)^{1-y_{i}}}_{(3)}]
$$

求$\phi$,显然只有$(3)$与$\phi$相关

$$
\begin{aligned}
(3)&=\sum\limits_{i=1}^{N}\log \phi^{y_{i}}(1-\phi)^{1-y_{i}}\\
&=\sum\limits_{i=1}^{N}[y_{i} \log \phi+(1-y_{i})\log(1-\phi)]\\
\frac{\partial (3)}{\partial \phi}&=\sum\limits_{i=1}^{N}\left[y_{i}\cdot \frac{1}{\phi}-\left(1-y_{i}\right) \frac{1}{1-\phi}\right]=0\\
0&=\sum\limits_{i=1}^{N}[y_{i}\cdot (1-\phi)-(1-y_{i})\phi]\\
0&=\sum\limits_{i=1}^{N}(y_{i}-y_{i}\phi-\phi+y_{i}\phi)\\
0&=\sum\limits_{i=1}^{N}(y_{i}-\phi)\\
0&=\sum\limits_{i=1}^{N}y_{i}-N \phi\\
\hat{\phi}&= \frac{\sum\limits_{i=1}^{N}y_{i}}{N}
\end{aligned}
$$

求$\mu_{1}$,显然只有$(1)$与$\mu_{1}$相关。对于$\mu_{2}$,推导类似于$\mu_{1}$,只需要用$1-y_{i}$替换$y_{i}$(并改用$(2)$)即可

$$
\begin{aligned}
(1)&=\sum\limits_{i=1}^{N}\log N(\mu_{1},\Sigma)^{y_{i}}\\
&=\sum\limits_{i=1}^{N}y_{i}\log \left\{\frac{1}{(2\pi)^{\frac{p}{2}}|\Sigma|^{\frac{1}{2}}}\exp\left[ - \frac{1}{2}(x_{i}-\mu_{1})^{T}\Sigma^{-1}(x_{i}-\mu_{1})\right]\right\}\\
\hat{\mu_{1}}&=\mathop{\arg\max}\limits_{\mu_{1}}\ (1)\\
&=\mathop{\arg\max}\limits_{\mu_{1}}\sum\limits_{i=1}^{N}y_{i}\left[ - \frac{1}{2}(x_{i}-\mu_{1})^{T}\Sigma^{-1}(x_{i}-\mu_{1})\right]\\
&=\mathop{\arg\max}\limits_{\mu_{1}}- \frac{1}{2}\sum\limits_{i=1}^{N}y_{i}(x_{i}^{T}\Sigma^{-1}-\mu_{1}^{T}\Sigma^{-1})(x_{i}-\mu_{1})\\
&=\mathop{\arg\max}\limits_{\mu_{1}}- \frac{1}{2}\sum\limits_{i=1}^{N}y_{i}(\underbrace{x_{i}^{T}\Sigma^{-1}x_{i}}_{\in \mathbb{R}}-\underbrace{x_{i}^{T}\Sigma^{-1}\mu_{1}}_{1 \times 1}-\underbrace{\mu_{1}^{T}\Sigma^{-1}x_{i}}_{1 \times 1}+\mu_{1}^{T}\Sigma^{-1}\mu_{1})\\
&=\mathop{\arg\max}\limits_{\mu_{1}}\underbrace{- \frac{1}{2}\sum\limits_{i=1}^{N}y_{i}(x_{i}^{T}\Sigma^{-1}x_{i}-2\mu_{1}^{T}\Sigma^{-1}x_{i}+\mu_{1}^{T}\Sigma^{-1}\mu_{1})}_{\Delta }\\
\frac{\partial \Delta }{\partial \mu_{1}}&=- \frac{1}{2}\sum\limits_{i=1}^{N}y_{i}(-2\Sigma^{-1}x_{i}+2\Sigma^{-1}\mu_{1})=0\\
0&=\sum\limits_{i=1}^{N}y_{i}(\Sigma^{-1}\mu_{1}-\Sigma^{-1}x_{i})\\
0&=\sum\limits_{i=1}^{N}y_{i}(\mu_{1}-x_{i})\\
\sum\limits_{i=1}^{N}y_{i}\mu_{1}&=\sum\limits_{i=1}^{N}y_{i}x_{i}\\
\hat{\mu_{1}}&=\frac{\sum\limits_{i=1}^{N}y_{i}x_{i}}{\sum\limits_{i=1}^{N}y_{i}}
\end{aligned}
$$

这里我们设

$$
\begin{aligned}
C_{1}&=\left\{x_{i}|y_{i}=1,i=1,2,\cdots,N\right\},|C_{1}|=N_{1}\\
C_{0}&=\left\{x_{i}|y_{i}=0,i=1,2,\cdots,N\right\},|C_{0}|=N_{0}\\
N&=N_{1}+N_{0}
\end{aligned}
$$

因此

$$
\hat{\mu_{1}}=\frac{\sum\limits_{i=1}^{N}y_{i}x_{i}}{N_{1}}
$$

再用$1-y_{i}$替换$y_{i}$,得$\hat{\mu_{2}}$

$$
\hat{\mu_{2}}=\frac{\sum\limits_{i=1}^{N}(1-y_{i})x_{i}}{\sum\limits_{i=1}^{N}(1-y_{i})}=\frac{\sum\limits_{i=1}^{N}(1-y_{i})x_{i}}{N-N_{1}}=\frac{\sum\limits_{i=1}^{N}(1-y_{i})x_{i}}{N_{0}}
$$

求$\Sigma$,显然只有$(1),(2)$与$\Sigma$相关

$$
\begin{aligned}
(1)+(2)&=\sum\limits_{i=1}^{N}y_{i}\log N(\mu_{1},\Sigma)+\sum\limits_{i=1}^{N}(1-y_{i})\log N(\mu_{2},\Sigma)\\
&=\sum\limits_{x_{i}\in C_{1}}\log N(\mu_{1},\Sigma)+\sum\limits_{x_{i}\in C_{2}}\log N(\mu_{2},\Sigma)\\
&\text{其中 }C_{2}=\left\{x_{i}|y_{i}=0\right\},\ |C_{2}|=N_{2}\text{(即 }y_{i}=0\text{ 的一类,对应 }\mu_{2}\text{),}N=N_{1}+N_{2}\\
\sum\limits_{i=1}^{N}\log N(\mu,\Sigma)&=\sum\limits_{i=1}^{N} \log\left\{\frac{1}{(2\pi)^{\frac{p}{2}}|\Sigma|^{\frac{1}{2}}}\exp\left[- \frac{1}{2}(x_{i}-\mu)^{T}\Sigma^{-1}(x_{i}-\mu)\right]\right\}\\
&=\sum\limits_{i=1}^{N}\left[\log \frac{1}{\left(2\pi\right)^{\frac{p}{2}}}+ \log |\Sigma|^{-\frac{1}{2}}+\left(- \frac{1}{2}(x_{i}-\mu)^{T}\Sigma^{-1}(x_{i}- \mu)\right)\right]\\
&=\sum\limits_{i=1}^{N}\left[c - \frac{1}{2}\log|\Sigma|- \frac{1}{2}(x_{i}-\mu)^{T}\Sigma^{-1}(x_{i}-\mu)\right],\quad c=\log \frac{1}{(2\pi)^{\frac{p}{2}}}\\
&=C- \frac{1}{2}N \log|\Sigma|- \frac{1}{2}\underbrace{\sum\limits_{i=1}^{N}(x_{i}-\mu)^{T}\Sigma^{-1}(x_{i}-\mu)}_{\in \mathbb{R}},\quad C=Nc\\
\sum\limits_{i=1}^{N}(x_{i}-\mu)^{T}\Sigma^{-1}(x_{i}-\mu)&=\sum\limits_{i=1}^{N}\text{tr }[(x_{i}-\mu)^{T}\Sigma^{-1}(x_{i}-\mu)]\\
&=\sum\limits_{i=1}^{N}\text{tr }[(x_{i}-\mu)(x_{i}-\mu)^{T}\Sigma^{-1}]\\
&=\text{tr }\left[\underbrace{\sum\limits_{i=1}^{N}(x_{i}-\mu)(x_{i}-\mu)^{T}}_{=N S\text{,}S\text{ 为 }x_{i}\text{ 的协方差}}\Sigma^{-1}\right]\\
&\text{设 }S= \frac{1}{N}\sum\limits_{i=1}^{N}(x_{i}-\mu)(x_{i}-\mu)^{T}\\
&=N \cdot  \text{tr }(S \Sigma^{-1})\\
&\text{带回}\sum\limits_{i=1}^{N}\log N(\mu,\Sigma)\\
\sum\limits_{i=1}^{N}\log N(\mu,\Sigma)&=C- \frac{1}{2}N \log|\Sigma|- \frac{1}{2}\sum\limits_{i=1}^{N}(x_{i}-\mu)^{T}\Sigma^{-1}(x_{i}-\mu)\\
&=- \frac{1}{2}N \log|\Sigma|- \frac{1}{2}N \cdot \text{tr }(S \cdot \Sigma^{-1})+C\\
&\text{带回}(1)+(2)\text{,分别对两类使用上式,其中}\\
&S_{1}=\frac{1}{N_{1}}\sum\limits_{x_{i}\in C_{1}}(x_{i}-\mu_{1})(x_{i}-\mu_{1})^{T},\quad S_{2}=\frac{1}{N_{2}}\sum\limits_{x_{i}\in C_{2}}(x_{i}-\mu_{2})(x_{i}-\mu_{2})^{T}\\
(1)+(2)&=- \frac{1}{2}N_{1}\log|\Sigma|- \frac{1}{2}N_{1} \cdot \text{tr }(S_{1} \Sigma^{-1})- \frac{1}{2}N_{2}\log|\Sigma|- \frac{1}{2}N_{2} \cdot \text{tr }(S_{2}\Sigma^{-1})+C\\
&=- \frac{1}{2}N \log|\Sigma|- \frac{1}{2}N_{1} \cdot \text{tr }(S_{1}\Sigma^{-1})- \frac{1}{2}N_{2} \cdot \text{tr }(S_{2} \Sigma^{-1})+C \\
&=- \frac{1}{2}[N \log|\Sigma|+ N_{1}\text{tr }(S_{1}\Sigma^{-1})+N_{2}\text{tr }(S_{2}\Sigma^{-1})]+C\\
\frac{\partial [(1)+(2)]}{\partial \Sigma}&=- \frac{1}{2}\left(N \cdot \frac{1}{|\Sigma|}|\Sigma|\Sigma^{-1}-N_{1}\Sigma^{-1}S_{1}\Sigma^{-1}-N_{2}\Sigma^{-1}S_{2}\Sigma^{-1}\right)=0\\
&\text{上式左右各乘 }\Sigma\text{,得}\\
N \Sigma-N_{1}S_{1}-N_{2}S_{2}&=0\\
\hat{\Sigma}&=\frac{1}{N}(N_{1}S_{1}+N_{2}S_{2})
\end{aligned}
$$

 

迹的性质

$$
\begin{aligned}
\text{tr }(AB)&=\text{tr }(BA)\\
\text{tr }(ABC)&=\text{tr }(CAB)=\text{tr }(BCA)
\end{aligned}
$$

矩阵求导

$$
\begin{aligned}
\frac{\partial \text{tr }(AB)}{\partial A}&=B^{T}\\
\frac{\partial |A|}{\partial A}&=|A|\cdot (A^{-1})^{T}\\
\frac{\partial \text{tr }(BA^{-1})}{\partial A}&=-(A^{-1}BA^{-1})^{T}
\end{aligned}
$$