Ahsan Ijaz
\[ \min_{w_0,w_1}\sum_{i=1}^{N}(y_i - [w_0 + w_{1}x_i])^2 \]
Finding the minimum of a sample equation with respect to \(w\), minimizing: \[ y = 50 + (w-10)^2 \]
First Derivative:
\[ \frac{d(y)}{dw} = 2(w-10) \]
library(ggplot2)
library(manipulate)
# Plot the parabola y = 50 + (w - 10)^2 over w in [0, 20], annotated with
# the value of the derivative 2*(w - 10) and of the "RSS" itself evaluated
# at the slider position W_v.
#
# @param W_v point at which the derivative/RSS annotations are evaluated
# @return a ggplot object (requires ggplot2 to be loaded)
RSSmin <- function(W_v) {
  w <- seq(0, 20)
  parabola <- data.frame(
    input = w,
    parabola = 50 + (w - 10)^2,
    derivative = 2 * (w - 10)
  )
  ggplot(data = parabola, aes(x = input, y = parabola, color = "red")) +
    # The colour legend carries no information here; "none" replaces the
    # deprecated guides(colour = FALSE) form (deprecated since ggplot2 3.3.4).
    guides(colour = "none") +
    geom_line(size = 2) +
    xlab("W variable") + ylab("Output of equation") +
    theme(
      text = element_text(size = 23),
      plot.title = element_text(size = rel(1))
    ) +
    annotate("text", x = 10, y = 25,
             label = paste("2*(w-10) =", as.character(2 * (W_v - 10))),
             size = 6) +
    annotate("text", x = 10, y = 100,
             label = paste("RSS = ", as.character(50 + (W_v - 10)^2)),
             size = 6)
}
# Interactive exploration: drag W_v over [-5, 15] and watch the RSS and
# derivative annotations update (requires RStudio's manipulate package).
manipulate(RSSmin(W_v), W_v = slider(-5, 15, step = 0.5))
\[ w_i = w_{i-1} - \zeta\times{}\nabla(\color{blue}{\textit{RSS}}) \]
Remember from before that RSS is given by:
\[\textit{RSS} = \sum_{i=1}^{N}(y_i - [\color{blue}{w_0} + \color{blue}{w_{1}}x_i])^2 \]
Taking derivative w.r.t \(w_0\) :
\[ -2\sum_{i=1}^{N}(y_i - [w_0 + w_{1}x_i]) \]
Taking derivative w.r.t \(w_1\) :
\[ -2\sum_{i=1}^{N}(y_i - [w_0 + w_{1}x_i])x_i \]
# Derivative of y = 50 + (w - center)^2 with respect to w.
#
# @param w point(s) at which to evaluate the derivative (vectorized)
# @param center location of the parabola's minimum; defaults to 10 to
#   match the example above, so existing callers are unaffected
# @return 2 * (w - center)
gradient <- function(w, center = 10) {
  2 * (w - center)
}
# Gradient descent on y = 50 + (w - 10)^2, starting from w = 20.
# Stops when |dy/dw| <= 0.5, or after max_iter steps — the cap guards
# against an infinite loop if zeta were ever set too large to converge.
w_old <- 20
zeta <- 0.01     # learning rate
i <- 0           # iteration counter
max_iter <- 10000
w_values <- numeric(0)   # trajectory of w after each step
while (abs(gradient(w_old)) > 0.5 && i < max_iter) {
  # Step against the gradient.
  w_old <- w_old - zeta * gradient(w_old)
  w_values <- c(w_values, w_old)
  i <- i + 1
}
\[ \nabla\textit{RSS}(\color{blue}{w_0},\color{blue}{w_1}) = \begin{bmatrix}-2\sum_{i=1}^{N}(y_i - [\color{blue}{w_0} + \color{blue}{w_{1}}x_i]) \\ -2\sum_{i=1}^{N}(y_i - [\color{blue}{w_0} + \color{blue}{w_{1}}x_i])x_i \end{bmatrix} \]
While not converged:
\[ \begin{bmatrix}w_0^{(t+1)} \\ w_1^{(t+1)} \end{bmatrix} = \begin{bmatrix}w_0^{(t)} \\ w_1^{(t)} \end{bmatrix} - \zeta\color{blue}{\nabla\textit{RSS}(w_0^{(t)},w_1^{(t)})} \]
Convergence condition?
When the gradient magnitude falls below a small tolerance: \(\|\nabla\textit{RSS}\| \leq \epsilon\).
\[ w_i = w_{i-1} - \color{blue}{\zeta}\times{}\nabla(\textit{RSS}) \]
# Noiseless training data sampled from the line y = 14 + 10 * x.
x <- 0:40
dat <- data.frame(input = x, output = 14 + 10 * x)
# Linear model prediction.
#
# @param w0 intercept
# @param w1 slope
# @param input numeric vector of inputs
# @return fitted values w0 + w1 * input (vectorized over input)
prediction <- function(w0, w1, input) {
  input * w1 + w0
}
# Residuals between observed and fitted values.
#
# @param actualValues numeric vector of observed responses
# @param predictions numeric vector of fitted values
# @return elementwise residuals (actual - predicted), same length as inputs
error <- function(actualValues, predictions) {
  actualValues - predictions
}
# Partial derivative of the RSS with respect to the intercept w0:
# -2 * sum of the residuals.
#
# @param error numeric vector of residuals
# @return scalar gradient component for w0
gradientW0 <- function(error) {
  total <- sum(error)
  -(2 * total)
}
# Partial derivative of the RSS with respect to the slope w1:
# -2 * sum of residuals weighted by their inputs.
#
# @param error numeric vector of residuals
# @param input numeric vector of inputs, same length as error
# @return scalar gradient component for w1
gradientW1 <- function(error, input) {
  weighted <- error * input
  -(2 * sum(weighted))
}
# Batch gradient descent for y = w0 + w1 * x on `dat`.
# grad0/grad1 are seeded with a dummy value (10) purely so the loop
# condition is TRUE on the first pass; they are overwritten immediately.
grad0 <- 10
grad1 <- 10
w0old <- 0
w1old <- 0
zeta <- 0.0001    # learning rate
iter <- 0
max_iter <- 1e6   # safety cap: avoid an infinite loop if zeta is too large
while ((abs(grad0) + abs(grad1)) > 0.5 && iter < max_iter) {
  # Full-batch pass: gradients use every observation.
  predictions <- prediction(w0old, w1old, dat$input)
  errors <- error(dat$output, predictions)
  grad0 <- gradientW0(errors)
  grad1 <- gradientW1(errors, dat$input)
  # Simultaneous update: both gradients were computed before either
  # weight changed.
  w0old <- w0old - zeta * grad0
  w1old <- w1old - zeta * grad1
  iter <- iter + 1
}
# Stochastic gradient descent: update the weights after every single
# observation instead of after a full pass over the data.
# BUG FIX: the original reused grad0/grad1 and w0old/w1old left over from
# the batch run above. That loop only exits once
# (abs(grad0) + abs(grad1)) <= 0.5, so this while condition was already
# FALSE and the body could never execute. Reset the state so SGD runs.
grad0 <- 10   # dummy seed so the loop condition is TRUE on entry
grad1 <- 10
w0old <- 0
w1old <- 0
while ((abs(grad0) + abs(grad1)) > 0.5) {
  # One epoch: visit each observation in turn.
  # NOTE(review): convergence is only tested between epochs, and on the
  # gradient of the last observation alone — kept as in the original.
  for (i in seq_along(dat$input)) {
    predictions <- prediction(w0old, w1old, dat$input[i])
    errors <- error(dat$output[i], predictions)
    grad0 <- gradientW0(errors)
    grad1 <- gradientW1(errors, dat$input[i])
    w0old <- w0old - zeta * grad0
    w1old <- w1old - zeta * grad1
    print(paste(w0old, w1old))
  }
}