# Setup

## Logistic regression

#' Cost function for Logistic regression, i.e. binomial family in GLM.
#'
#' @param data Data to be used to calculate the cost values. The last column is
#'     the response variable.
#' @param family Family of the distribution.
#' @keywords internal
#'
#' @noRd
#' @return Cost value for the corresponding segment of data.
cost_glm_binomial <- function(data, family = "binomial") {
data <- as.matrix(data)
p <- dim(data)[2] - 1
out <- fastglm::fastglm(
as.matrix(data[, 1:p]), data[, p + 1],
family = family
)
return(out$deviance / 2) } #' Implementation of vanilla PELT for logistic regression type data. #' #' @param data Data to be used for change point detection. #' @param beta Penalty coefficient for the number of change points. #' @param cost Cost function to be used to calculate cost values. #' @keywords internal #' #' @noRd #' @return A list consisting of two: change point locations and negative log #' likelihood values for each segment. pelt_vanilla_binomial <- function(data, beta, cost = cost_glm_binomial) { n <- dim(data)[1] p <- dim(data)[2] - 1 Fobj <- c(-beta, 0) cp_set <- list(NULL, 0) set <- c(0, 1) for (t in 2:n) { m <- length(set) cval <- rep(NA, m) for (i in 1:m) { k <- set[i] + 1 cval[i] <- 0 if (t - k >= p - 1) cval[i] <- suppressWarnings(cost(data[k:t, ])) } obj <- cval + Fobj[set + 1] + beta min_val <- min(obj) ind <- which(obj == min_val)[1] cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind]) cp_set <- append(cp_set, list(cp_set_add)) ind2 <- (cval + Fobj[set + 1]) <= min_val set <- c(set[ind2], t) Fobj <- c(Fobj, min_val) } cp <- cp_set[[n + 1]] nLL <- 0 cp_loc <- unique(c(0, cp, n)) for (i in 1:(length(cp_loc) - 1)) { seg <- (cp_loc[i] + 1):cp_loc[i + 1] data_seg <- data[seg, ] out <- fastglm::fastglm( as.matrix(data_seg[, 1:p]), data_seg[, p + 1], family = "binomial" ) nLL <- out$deviance / 2 + nLL
}

output <- list(cp, nLL)
names(output) <- c("cp", "nLL")
return(output)
}

#' Function to update the coefficients using gradient descent.
#'
#' @param data_new New data point used to update the coeffient.
#' @param coef Previous coeffient to be updated.
#' @param cum_coef Summation of all the past coefficients to be used in
#'     averaging.
#' @param cmatrix Hessian matrix in gradient descent.
#' @param epsilon Small adjustment to avoid singularity when doing inverse on
#'     the Hessian matrix.
#' @keywords internal
#'
#' @noRd
#' @return A list of values containing the new coefficients, summation of
#'     coefficients so far and all the Hessian matrices.
cost_logistic_update <- function(
data_new, coef, cum_coef, cmatrix, epsilon = 1e-10) {
p <- length(data_new) - 1
X_new <- data_new[1:p]
Y_new <- data_new[p + 1]
eta <- X_new %*% coef
mu <- 1 / (1 + exp(-eta))
cmatrix <- cmatrix + (X_new %o% X_new) * as.numeric((1 - mu) * mu)
lik_dev <- as.numeric(-(Y_new - mu)) * X_new
coef <- coef - solve(cmatrix + epsilon * diag(1, p), lik_dev)
cum_coef <- cum_coef + coef
return(list(coef, cum_coef, cmatrix))
}

#' Calculate negative log likelihood given data segment and guess of
#' coefficient.
#'
#' @param data Data to be used to calculate the negative log likelihood.
#' @param b Guess of the coefficient.
#' @keywords internal
#'
#' @noRd
#' @return Negative log likelihood.
neg_log_lik_binomial <- function(data, b) {
p <- dim(data)[2] - 1
X <- data[, 1:p, drop = FALSE]
Y <- data[, p + 1, drop = FALSE]
u <- as.numeric(X %*% b)
L <- -Y * u + log(1 + exp(u))
return(sum(L))
}

#' Find change points using dynamic programming with pruning and SeGD.
#'
#' @param data Data used to find change points.
#' @param beta Penalty coefficient for the number of change points.
#' @param B Initial guess on the number of change points.
#' @param trim Propotion of the data to ignore the change points at the
#'     beginning, ending and between change points.
#' @keywords internal
#'
#' @noRd
#' @return A list containing potential change point locations and negative log
#'     likelihood for each segment based on the change points guess.
segd_binomial <- function(data, beta, B = 10, trim = 0.025) {
n <- dim(data)[1]
p <- dim(data)[2] - 1
Fobj <- c(-beta, 0)
cp_set <- list(NULL, 0)
set <- c(0, 1)

# choose the initial values based on pre-segmentation

index <- rep(1:B, rep(n / B, B))
coef.int <- matrix(NA, B, p)
for (i in 1:B)
{
out <- fastglm::fastglm(
as.matrix(data[index == i, 1:p]),
data[index == i, p + 1],
family = "binomial"
)
coef.int[i, ] <- coef(out)
}
X1 <- data[1, 1:p]
cum_coef <- coef <- matrix(coef.int[1, ], p, 1)
e_eta <- exp(coef %*% X1)
const <- e_eta / (1 + e_eta)^2
cmatrix <- array((X1 %o% X1) * as.numeric(const), c(p, p, 1))

for (t in 2:n)
{
m <- length(set)
cval <- rep(NA, m)

for (i in 1:(m - 1))
{
coef_c <- coef[, i]
cum_coef_c <- cum_coef[, i]
cmatrix_c <- cmatrix[, , i]
out <- cost_logistic_update(data[t, ], coef_c, cum_coef_c, cmatrix_c)
coef[, i] <- out[[1]]
cum_coef[, i] <- out[[2]]
cmatrix[, , i] <- out[[3]]
k <- set[i] + 1
cval[i] <- 0
if (t - k >= p - 1) {
cval[i] <-
neg_log_lik_binomial(data[k:t, ], cum_coef[, i] / (t - k + 1))
}
}

# the choice of initial values requires further investigation

cval[m] <- 0
Xt <- data[t, 1:p]
const <- e_eta_t / (1 + e_eta_t)^2
cmatrix_add <- (Xt %o% Xt) * as.numeric(const)

cmatrix <- abind::abind(cmatrix, cmatrix_add, along = 3)

# Adding a momentum term (TBD)

obj <- cval + Fobj[set + 1] + beta
min_val <- min(obj)
ind <- which(obj == min_val)[1]
cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
ind2 <- (cval + Fobj[set + 1]) <= min_val
set <- c(set[ind2], t)
coef <- coef[, ind2, drop = FALSE]
cum_coef <- cum_coef[, ind2, drop = FALSE]
cmatrix <- cmatrix[, , ind2, drop = FALSE]
Fobj <- c(Fobj, min_val)
}

# Remove change-points close to the boundaries

cp <- cp_set[[n + 1]]
if (length(cp) > 0) {
ind3 <- (seq_len(length(cp)))[(cp < trim * n) | (cp > (1 - trim) * n)]
cp <- cp[-ind3]
}

nLL <- 0
cp_loc <- unique(c(0, cp, n))
for (i in 1:(length(cp_loc) - 1))
{
seg <- (cp_loc[i] + 1):cp_loc[i + 1]
data_seg <- data[seg, ]
out <- fastglm::fastglm(
as.matrix(data_seg[, 1:p]), data_seg[, p + 1],
family = "binomial"
)
nLL <- out$deviance / 2 + nLL } output <- list(cp, nLL) names(output) <- c("cp", "nLL") return(output) } ## Poisson regression #' Cost function for Poisson regression. #' #' @param data Data to be used to calculate the cost values. The last column is #' the response variable. #' @param family Family of the distribution. #' @keywords internal #' #' @noRd #' @return Cost value for the corresponding segment of data. cost_glm_poisson <- function(data, family = "poisson") { data <- as.matrix(data) p <- dim(data)[2] - 1 out <- fastglm::fastglm(as.matrix(data[, 1:p]), data[, p + 1], family = family) return(out$deviance / 2)
}

#' Implementation of vanilla PELT for poisson regression type data.
#'
#' @param data Data to be used for change point detection.
#' @param beta Penalty coefficient for the number of change points.
#' @param cost Cost function to be used to calculate cost values.
#' @keywords internal
#'
#' @noRd
#' @return A list consisting of two: change point locations and negative log
#'     likelihood values for each segment.
pelt_vanilla_poisson <- function(data, beta, cost = cost_glm_poisson) {
n <- dim(data)[1]
p <- dim(data)[2] - 1
Fobj <- c(-beta, 0)
cp_set <- list(NULL, 0)
set <- c(0, 1)
for (t in 2:n)
{
m <- length(set)
cval <- rep(NA, m)
for (i in 1:m)
{
k <- set[i] + 1
if (t - k >= p - 1) cval[i] <- suppressWarnings(cost(data[k:t, ])) else cval[i] <- 0
}
obj <- cval + Fobj[set + 1] + beta
min_val <- min(obj)
ind <- which(obj == min_val)[1]
cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
ind2 <- (cval + Fobj[set + 1]) <= min_val
set <- c(set[ind2], t)
Fobj <- c(Fobj, min_val)
# if (t %% 100 == 0) print(t)
}
cp <- cp_set[[n + 1]]
output <- list(cp)
names(output) <- c("cp")

return(output)
}

#' Function to update the coefficients using gradient descent.
#'
#' @param data_new New data point used to update the coeffient.
#' @param coef Previous coeffient to be updated.
#' @param cum_coef Summation of all the past coefficients to be used in
#'     averaging.
#' @param cmatrix Hessian matrix in gradient descent.
#' @param epsilon Small adjustment to avoid singularity when doing inverse on
#'     the Hessian matrix.
#' @param G Upper bound for the coefficient.
#' @param L Winsorization lower bound.
#' @param H Winsorization upper bound.
#' @keywords internal
#'
#' @noRd
#' @return A list of values containing the new coefficients, summation of
#'     coefficients so far and all the Hessian matrices.
cost_poisson_update <- function(data_new, coef, cum_coef, cmatrix, epsilon = 0.001, G = 10^10, L = -20, H = 20) {
p <- length(data_new) - 1
X_new <- data_new[1:p]
Y_new <- data_new[p + 1]
eta <- X_new %*% coef
mu <- exp(eta)
cmatrix <- cmatrix + (X_new %o% X_new) * min(as.numeric(mu), G)
lik_dev <- as.numeric(-(Y_new - mu)) * X_new
coef <- coef - solve(cmatrix + epsilon * diag(1, p), lik_dev)
coef <- pmin(pmax(coef, L), H)
cum_coef <- cum_coef + coef
return(list(coef, cum_coef, cmatrix))
}

#' Calculate negative log likelihood given data segment and guess of
#' coefficient.
#'
#' @param data Data to be used to calculate the negative log likelihood.
#' @param b Guess of the coefficient.
#' @keywords internal
#'
#' @noRd
#' @return Negative log likelihood.
neg_log_lik_poisson <- function(data, b) {
p <- dim(data)[2] - 1
X <- data[, 1:p, drop = FALSE]
Y <- data[, p + 1, drop = FALSE]
u <- as.numeric(X %*% b)
L <- -Y * u + exp(u) + lfactorial(Y)
return(sum(L))
}

#' Find change points using dynamic programming with pruning and SeGD.
#'
#' @param data Data used to find change points.
#' @param beta Penalty coefficient for the number of change points.
#' @param B Initial guess on the number of change points.
#' @param trim Propotion of the data to ignore the change points at the
#'     beginning, ending and between change points.
#' @param epsilon Small adjustment to avoid singularity when doing inverse on
#'    the Hessian matrix.
#' @param G Upper bound for the coefficient.
#' @param L Winsorization lower bound.
#' @param H Winsorization upper bound.
#' @keywords internal
#'
#' @noRd
#' @return A list containing potential change point locations and negative log
#'     likelihood for each segment based on the change points guess.
segd_poisson <- function(data, beta, B = 10, trim = 0.03, epsilon = 0.001, G = 10^10, L = -20, H = 20) {
n <- dim(data)[1]
p <- dim(data)[2] - 1
Fobj <- c(-beta, 0)
cp_set <- list(NULL, 0)
set <- c(0, 1)

# choose the initial values based on pre-segmentation

index <- rep(1:B, rep(n / B, B))
coef.int <- matrix(NA, B, p)
for (i in 1:B)
{
out <- fastglm::fastglm(x = as.matrix(data[index == i, 1:p]), y = data[index == i, p + 1], family = "poisson")
coef.int[i, ] <- coef(out)
}
X1 <- data[1, 1:p]
cum_coef <- coef <- pmin(pmax(matrix(coef.int[1, ], p, 1), L), H)
e_eta <- exp(coef %*% X1)
const <- e_eta
cmatrix <- array((X1 %o% X1) * as.numeric(const), c(p, p, 1))

for (t in 2:n)
{
m <- length(set)
cval <- rep(NA, m)
for (i in 1:(m - 1))
{
coef_c <- coef[, i]
cum_coef_c <- cum_coef[, i]
cmatrix_c <- cmatrix[, , i]
out <- cost_poisson_update(data[t, ], coef_c, cum_coef_c, cmatrix_c, epsilon = epsilon, G = G, L = L, H = H)
coef[, i] <- out[[1]]
cum_coef[, i] <- out[[2]]
cmatrix[, , i] <- out[[3]]
k <- set[i] + 1
cum_coef_win <- pmin(pmax(cum_coef[, i] / (t - k + 1), L), H)
if (t - k >= p - 1) cval[i] <- neg_log_lik_poisson(data[k:t, ], cum_coef_win) else cval[i] <- 0
}

# the choice of initial values requires further investigation

cval[m] <- 0
Xt <- data[t, 1:p]
const <- e_eta_t
cmatrix_add <- (Xt %o% Xt) * as.numeric(const)
cmatrix <- abind::abind(cmatrix, cmatrix_add, along = 3)

# Adding a momentum term (TBD)

obj <- cval + Fobj[set + 1] + beta
min_val <- min(obj)
ind <- which(obj == min_val)[1]
cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
ind2 <- (cval + Fobj[set + 1]) <= min_val
set <- c(set[ind2], t)
coef <- coef[, ind2, drop = FALSE]
cum_coef <- cum_coef[, ind2, drop = FALSE]
cmatrix <- cmatrix[, , ind2, drop = FALSE]
Fobj <- c(Fobj, min_val)
}

# Remove change-points close to the boundaries

cp <- cp_set[[n + 1]]
if (length(cp) > 0) {
ind3 <- (1:length(cp))[(cp < trim * n) | (cp > (1 - trim) * n)]
if (length(ind3) > 0) cp <- cp[-ind3]
}

cp <- sort(unique(c(0, cp)))
index <- which((diff(cp) < trim * n) == TRUE)
if (length(index) > 0) cp <- floor((cp[-(index + 1)] + cp[-index]) / 2)
cp <- cp[cp > 0]

# nLL <- 0
# cp_loc <- unique(c(0,cp,n))
# for(i in 1:(length(cp_loc)-1))
# {
#   seg <- (cp_loc[i]+1):cp_loc[i+1]
#   data_seg <- data[seg,]
#   out <- fastglm(as.matrix(data_seg[, 1:p]), data_seg[, p+1], family="Poisson")
#   nLL <- out$deviance/2 + nLL # } # output <- list(cp, nLL) # names(output) <- c("cp", "nLL") output <- list(cp) names(output) <- c("cp") return(output) } # Generate data from poisson regression models with change-points #' @param n Number of observations. #' @param d Dimension of the covariates. #' @param true.coef True regression coefficients. #' @param true.cp.loc True change-point locations. #' @param Sigma Covariance matrix of the covariates. #' @keywords internal #' #' @noRd #' @return A list containing the generated data and the true cluster #' assignments. data_gen_poisson <- function(n, d, true.coef, true.cp.loc, Sigma) { loc <- unique(c(0, true.cp.loc, n)) if (dim(true.coef)[2] != length(loc) - 1) stop("true.coef and true.cp.loc do not match") x <- mvtnorm::rmvnorm(n, mean = rep(0, d), sigma = Sigma) y <- NULL for (i in 1:(length(loc) - 1)) { mu <- exp(x[(loc[i] + 1):loc[i + 1], , drop = FALSE] %*% true.coef[, i, drop = FALSE]) group <- rpois(length(mu), mu) y <- c(y, group) } data <- cbind(x, y) true_cluster <- rep(1:(length(loc) - 1), diff(loc)) result <- list(data, true_cluster) return(result) } ## Penalized linear regression #' Cost function for penalized linear regression. #' #' @param data Data to be used to calculate the cost values. The last column is #' the response variable. #' @param lambda Penalty coefficient. #' @param family Family of the distribution. #' @keywords internal #' #' @noRd #' @return Cost value for the corresponding segment of data. cost_lasso <- function(data, lambda, family = "gaussian") { data <- as.matrix(data) n <- dim(data)[1] p <- dim(data)[2] - 1 out <- glmnet::glmnet(as.matrix(data[, 1:p]), data[, p + 1], family = family, lambda = lambda) return(deviance(out) / 2) } #' Implementation of vanilla PELT for penalized linear regression type data. #' #' @param data Data to be used for change point detection. #' @param beta Penalty coefficient for the number of change points. #' @param B Initial guess on the number of change points. #' @param cost Cost function to be used to calculate cost values. #' @param family Family of the distribution. #' @keywords internal #' #' @noRd #' @return A list consisting of two: change point locations and negative log #' likelihood values for each segment. pelt_vanilla_lasso <- function(data, beta, B = 10, cost = cost_lasso, family = "gaussian") { n <- dim(data)[1] p <- dim(data)[2] - 1 index <- rep(1:B, rep(n / B, B)) err_sd <- act_num <- rep(NA, B) for (i in 1:B) { cvfit <- glmnet::cv.glmnet(as.matrix(data[index == i, 1:p]), data[index == i, p + 1], family = family) coef <- coef(cvfit, s = "lambda.1se")[-1] resi <- data[index == i, p + 1] - as.matrix(data[index == i, 1:p]) %*% as.numeric(coef) err_sd[i] <- sqrt(mean(resi^2)) act_num[i] <- sum(abs(coef) > 0) } err_sd_mean <- mean(err_sd) # only works if error sd is unchanged. act_num_mean <- mean(act_num) beta <- (act_num_mean + 1) * beta # seems to work but there might be better choices Fobj <- c(-beta, 0) cp_set <- list(NULL, 0) set <- c(0, 1) for (t in 2:n) { m <- length(set) cval <- rep(NA, m) for (i in 1:m) { k <- set[i] + 1 if (t - k >= 1) cval[i] <- suppressWarnings(cost(data[k:t, ], lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1)))) else cval[i] <- 0 } obj <- cval + Fobj[set + 1] + beta min_val <- min(obj) ind <- which(obj == min_val)[1] cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind]) cp_set <- append(cp_set, list(cp_set_add)) ind2 <- (cval + Fobj[set + 1]) <= min_val set <- c(set[ind2], t) Fobj <- c(Fobj, min_val) if (t %% 100 == 0) print(t) } cp <- cp_set[[n + 1]] # nLL <- 0 # cp_loc <- unique(c(0,cp,n)) # for(i in 1:(length(cp_loc)-1)) # { # seg <- (cp_loc[i]+1):cp_loc[i+1] # data_seg <- data[seg,] # out <- glmnet(as.matrix(data_seg[, 1:p]), data_seg[, p+1], lambda=lambda, family=family) # nLL <- deviance(out)/2 + nLL # } # output <- list(cp, nLL) output <- list(cp) names(output) <- c("cp") return(output) } #' Function to update the coefficients using gradient descent. #' @param a Coefficient to be updated. #' @param lambda Penalty coefficient. #' @keywords internal #' #' @noRd #' @return Updated coefficient. soft_threshold <- function(a, lambda) { sign(a) * pmax(abs(a) - lambda, 0) } #' Function to update the coefficients using gradient descent. #' #' @param data_new New data point used to update the coeffient. #' @param coef Previous coeffient to be updated. #' @param cum_coef Summation of all the past coefficients to be used in #' averaging. #' @param cmatrix Hessian matrix in gradient descent. #' @param lambda Penalty coefficient. #' @keywords internal #' #' @noRd #' @return A list of values containing the new coefficients, summation of #' coefficients so far and all the Hessian matrices. cost_lasso_update <- function(data_new, coef, cum_coef, cmatrix, lambda) { p <- length(data_new) - 1 X_new <- data_new[1:p] Y_new <- data_new[p + 1] mu <- X_new %*% coef cmatrix <- cmatrix + X_new %o% X_new # B <- as.vector(cmatrix_inv%*%X_new) # cmatrix_inv <- cmatrix_inv - B%o%B/(1+sum(X_new*B)) lik_dev <- as.numeric(-(Y_new - mu)) * X_new coef <- coef - solve(cmatrix, lik_dev) nc <- norm(cmatrix, type = "F") # the choice of norm affects the speed. Spectral norm is more accurate but slower than F norm. coef <- soft_threshold(coef, lambda / nc) cum_coef <- cum_coef + coef return(list(coef, cum_coef, cmatrix)) } #' Calculate negative log likelihood given data segment and guess of #' coefficient. #' #' @param data Data to be used to calculate the negative log likelihood. #' @param b Guess of the coefficient. #' @param lambda Penalty coefficient. #' @keywords internal #' #' @noRd #' @return Negative log likelihood. neg_log_lik_lasso <- function(data, b, lambda) { p <- dim(data)[2] - 1 X <- data[, 1:p, drop = FALSE] Y <- data[, p + 1, drop = FALSE] resi <- Y - X %*% b L <- sum(resi^2) / 2 + lambda * sum(abs(b)) return(L) } #' Find change points using dynamic programming with pruning and SeGD. #' #' @param data Data used to find change points. #' @param beta Penalty coefficient for the number of change points. #' @param B Initial guess on the number of change points. #' @param trim Propotion of the data to ignore the change points at the #' beginning, ending and between change points. #' @param epsilon Small adjustment to avoid singularity when doing inverse on #' the Hessian matrix. #' @param family Family of the distribution. #' @keywords internal #' #' @noRd #' @return A list containing potential change point locations and negative log #' likelihood for each segment based on the change points guess. segd_lasso <- function(data, beta, B = 10, trim = 0.025, epsilon = 1e-5, family = "gaussian") { n <- dim(data)[1] p <- dim(data)[2] - 1 Fobj <- c(-beta, 0) cp_set <- list(NULL, 0) set <- c(0, 1) # choose the initial values based on pre-segmentation index <- rep(1:B, rep(n / B, B)) coef.int <- matrix(NA, B, p) err_sd <- act_num <- rep(NA, B) for (i in 1:B) { cvfit <- glmnet::cv.glmnet(as.matrix(data[index == i, 1:p]), data[index == i, p + 1], family = family) coef.int[i, ] <- coef(cvfit, s = "lambda.1se")[-1] resi <- data[index == i, p + 1] - as.matrix(data[index == i, 1:p]) %*% as.numeric(coef.int[i, ]) err_sd[i] <- sqrt(mean(resi^2)) act_num[i] <- sum(abs(coef.int[i, ]) > 0) } err_sd_mean <- mean(err_sd) # only works if error sd is unchanged. act_num_mean <- mean(act_num) beta <- (act_num_mean + 1) * beta # seems to work but there might be better choices X1 <- data[1, 1:p] cum_coef <- coef <- matrix(coef.int[1, ], p, 1) eta <- coef %*% X1 # c_int <- diag(1/epsilon,p) - X1%o%X1/epsilon^2/(1+sum(X1^2)/epsilon) # cmatrix_inv <- array(c_int, c(p,p,1)) cmatrix <- array(X1 %o% X1 + epsilon * diag(1, p), c(p, p, 1)) for (t in 2:n) { m <- length(set) cval <- rep(NA, m) for (i in 1:(m - 1)) { coef_c <- coef[, i] cum_coef_c <- cum_coef[, i] # cmatrix_inv_c <- cmatrix_inv[,,i] cmatrix_c <- cmatrix[, , i] k <- set[i] + 1 out <- cost_lasso_update(data[t, ], coef_c, cum_coef_c, cmatrix_c, lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1))) coef[, i] <- out[[1]] cum_coef[, i] <- out[[2]] # cmatrix_inv[,,i] <- out[[3]] cmatrix[, , i] <- out[[3]] if (t - k >= 2) cval[i] <- neg_log_lik_lasso(data[k:t, ], cum_coef[, i] / (t - k + 1), lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1))) else cval[i] <- 0 } # the choice of initial values requires further investigation cval[m] <- 0 Xt <- data[t, 1:p] cum_coef_add <- coef_add <- coef.int[index[t], ] # cmatrix_inv_add <- diag(1/epsilon,p) - Xt%o%Xt/epsilon^2/(1+sum(Xt^2)/epsilon) coef <- cbind(coef, coef_add) cum_coef <- cbind(cum_coef, cum_coef_add) # cmatrix_inv <- abind::abind(cmatrix_inv, cmatrix_inv_add, along=3) cmatrix <- abind::abind(cmatrix, Xt %o% Xt + epsilon * diag(1, p), along = 3) obj <- cval + Fobj[set + 1] + beta min_val <- min(obj) ind <- which(obj == min_val)[1] cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind]) cp_set <- append(cp_set, list(cp_set_add)) ind2 <- (cval + Fobj[set + 1]) <= min_val set <- c(set[ind2], t) coef <- coef[, ind2, drop = FALSE] cum_coef <- cum_coef[, ind2, drop = FALSE] cmatrix <- cmatrix[, , ind2, drop = FALSE] # cmatrix_inv <- cmatrix_inv[,,ind2,drop=FALSE] Fobj <- c(Fobj, min_val) } # Remove change-points close to the boundaries and merge change-points cp <- cp_set[[n + 1]] if (length(cp) > 0) { ind3 <- (1:length(cp))[(cp < trim * n) | (cp > (1 - trim) * n)] if (length(ind3) > 0) cp <- cp[-ind3] } cp <- sort(unique(c(0, cp))) index <- which((diff(cp) < trim * n) == TRUE) if (length(index) > 0) cp <- floor((cp[-(index + 1)] + cp[-index]) / 2) cp <- cp[cp > 0] # nLL <- 0 # cp_loc <- unique(c(0,cp,n)) # for(i in 1:(length(cp_loc)-1)) # { # seg <- (cp_loc[i]+1):cp_loc[i+1] # data_seg <- data[seg,] # out <- fastglm(as.matrix(data_seg[, 1:p]), data_seg[, p+1], family="binomial") # nLL <- out$deviance/2 + nLL
# }

output <- list(cp)
names(output) <- c("cp")
return(output)
}

# Generate data from penalized linear regression models with change-points
#' @param n Number of observations.
#' @param d Dimension of the covariates.
#' @param true.coef True regression coefficients.
#' @param true.cp.loc True change-point locations.
#' @param Sigma Covariance matrix of the covariates.
#' @param evar Error variance.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the generated data and the true cluster
#'    assignments.
data_gen_lasso <- function(n, d, true.coef, true.cp.loc, Sigma, evar) {
loc <- unique(c(0, true.cp.loc, n))
if (dim(true.coef)[2] != length(loc) - 1) stop("true.coef and true.cp.loc do not match")
x <- mvtnorm::rmvnorm(n, mean = rep(0, d), sigma = Sigma)
y <- NULL
for (i in 1:(length(loc) - 1))
{
Xb <- x[(loc[i] + 1):loc[i + 1], , drop = FALSE] %*% true.coef[, i, drop = FALSE]
add <- Xb + rnorm(length(Xb), sd = sqrt(evar))
}
data <- cbind(x, y)
true_cluster <- rep(1:(length(loc) - 1), diff(loc))
result <- list(data, true_cluster)
return(result)
}

# Logistic regression

set.seed(1)
p <- 5
x <- matrix(rnorm(300 * p, 0, 1), ncol = p)

# Randomly generate coefficients with different means.
theta <- rbind(rnorm(p, 0, 1), rnorm(p, 2, 1))

# Randomly generate response variables based on the segmented data and
# corresponding coefficients
y <- c(
rbinom(125, 1, 1 / (1 + exp(-x[1:125, ] %*% theta[1, ]))),
rbinom(300 - 125, 1, 1 / (1 + exp(-x[(125 + 1):300, ] %*% theta[2, ])))
)

segd_binomial(cbind(x, y), (p + 1) * log(300) / 2, B = 5)$cp #> [1] 125 fastcpd.binomial( cbind(y, x), segment_count = 5, beta = "BIC", cost_adjustment = "BIC", r.progress = FALSE )@cp_set #> [1] 125 pelt_vanilla_binomial(cbind(x, y), (p + 1) * log(300) / 2)$cp
#> [1]   0 125

fastcpd.binomial(
cbind(y, x),
segment_count = 5,
vanilla_percentage = 1,
beta = "BIC",
r.progress = FALSE
)@cp_set
#> [1] 125

# Poisson regression

set.seed(1)
n <- 1500
d <- 5
rho <- 0.9
Sigma <- array(0, c(d, d))
for (i in 1:d) {
Sigma[i, ] <- rho^(abs(i - (1:d)))
}
delta <- c(5, 7, 9, 11, 13)
a.sq <- 1
delta.new <-
delta * sqrt(a.sq) / sqrt(as.numeric(t(delta) %*% Sigma %*% delta))
true.cp.loc <- c(375, 750, 1125)

# regression coefficients
true.coef <- matrix(0, nrow = d, ncol = length(true.cp.loc) + 1)
true.coef[, 1] <- c(1, 1.2, -1, 0.5, -2)
true.coef[, 2] <- true.coef[, 1] + delta.new
true.coef[, 3] <- true.coef[, 1]
true.coef[, 4] <- true.coef[, 3] - delta.new

out <- data_gen_poisson(n, d, true.coef, true.cp.loc, Sigma)
data <- out[[1]]
g_tr <- out[[2]]
beta <- log(n) * (d + 1) / 2

segd_poisson(
data, beta, trim = 0.03, B = 10, epsilon = 0.001, G = 10^10, L = -20, H = 20
)$cp #> [1] 380 751 1136 1251 fastcpd.poisson( cbind(data[, d + 1], data[, 1:d]), beta = beta, cost_adjustment = "BIC", epsilon = 0.001, segment_count = 10, r.progress = FALSE )@cp_set #> [1] 380 751 1136 1251 pelt_vanilla_poisson(data, beta)$cp
#> [1]    0  374  752 1133

fastcpd.poisson(
cbind(data[, d + 1], data[, 1:d]),
segment_count = 10,
vanilla_percentage = 1,
beta = beta,
r.progress = FALSE
)@cp_set
#> [1]  374  752 1133

# Penalized linear regression

set.seed(1)
n <- 1000
s <- 3
d <- 50
evar <- 0.5
Sigma <- diag(1, d)
true.cp.loc <- c(100, 300, 500, 800, 900)
seg <- length(true.cp.loc) + 1
true.coef <- matrix(rnorm(seg * s), s, seg)
true.coef <- rbind(true.coef, matrix(0, d - s, seg))
out <- data_gen_lasso(n, d, true.coef, true.cp.loc, Sigma, evar)
data <- out[[1]]
beta <- log(n) / 2 # beta here has different meaning

segd_lasso(data, beta, B = 10, trim = 0.025)$cp #> [1] 100 300 520 800 901 fastcpd.lasso( cbind(data[, d + 1], data[, 1:d]), epsilon = 1e-5, beta = beta, cost_adjustment = "BIC", pruning_coef = 0, r.progress = FALSE )@cp_set #> [1] 100 300 520 800 901 pelt_vanilla_lasso(data, beta, cost = cost_lasso)$cp
#> [1] 100
#> [1] 200
#> [1] 300
#> [1] 400
#> [1] 500
#> [1] 600
#> [1] 700
#> [1] 800
#> [1] 900
#> [1] 1000
#> [1]   0 103 299 510 800 900

fastcpd.lasso(
cbind(data[, d + 1], data[, 1:d]),
vanilla_percentage = 1,
epsilon = 1e-5,
beta = beta,
pruning_coef = 0,
r.progress = FALSE
)@cp_set
#> [1] 103 299 510 800 900

# Notes

The evaluation of this vignette is set to be FALSE.

# Appendix: all code snippets

knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = FALSE, cache = FALSE, warning = FALSE, fig.width = 8, fig.height = 5 ) library(fastcpd) #' Cost function for Logistic regression, i.e. binomial family in GLM. #' #' @param data Data to be used to calculate the cost values. The last column is #' the response variable. #' @param family Family of the distribution. #' @keywords internal #' #' @noRd #' @return Cost value for the corresponding segment of data. cost_glm_binomial <- function(data, family = "binomial") { data <- as.matrix(data) p <- dim(data)[2] - 1 out <- fastglm::fastglm( as.matrix(data[, 1:p]), data[, p + 1], family = family ) return(out$deviance / 2)
}

#' Implementation of vanilla PELT for logistic regression type data.
#'
#' @param data Data to be used for change point detection.
#' @param beta Penalty coefficient for the number of change points.
#' @param cost Cost function to be used to calculate cost values.
#' @keywords internal
#'
#' @noRd
#' @return A list consisting of two: change point locations and negative log
#'     likelihood values for each segment.
pelt_vanilla_binomial <- function(data, beta, cost = cost_glm_binomial) {
n <- dim(data)[1]
p <- dim(data)[2] - 1
Fobj <- c(-beta, 0)
cp_set <- list(NULL, 0)
set <- c(0, 1)
for (t in 2:n)
{
m <- length(set)
cval <- rep(NA, m)
for (i in 1:m)
{
k <- set[i] + 1
cval[i] <- 0
if (t - k >= p - 1) cval[i] <- suppressWarnings(cost(data[k:t, ]))
}
obj <- cval + Fobj[set + 1] + beta
min_val <- min(obj)
ind <- which(obj == min_val)[1]
cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
ind2 <- (cval + Fobj[set + 1]) <= min_val
set <- c(set[ind2], t)
Fobj <- c(Fobj, min_val)
}
cp <- cp_set[[n + 1]]
nLL <- 0
cp_loc <- unique(c(0, cp, n))
for (i in 1:(length(cp_loc) - 1))
{
seg <- (cp_loc[i] + 1):cp_loc[i + 1]
data_seg <- data[seg, ]
out <- fastglm::fastglm(
as.matrix(data_seg[, 1:p]), data_seg[, p + 1],
family = "binomial"
)
nLL <- out$deviance / 2 + nLL } output <- list(cp, nLL) names(output) <- c("cp", "nLL") return(output) } #' Function to update the coefficients using gradient descent. #' #' @param data_new New data point used to update the coeffient. #' @param coef Previous coeffient to be updated. #' @param cum_coef Summation of all the past coefficients to be used in #' averaging. #' @param cmatrix Hessian matrix in gradient descent. #' @param epsilon Small adjustment to avoid singularity when doing inverse on #' the Hessian matrix. #' @keywords internal #' #' @noRd #' @return A list of values containing the new coefficients, summation of #' coefficients so far and all the Hessian matrices. cost_logistic_update <- function( data_new, coef, cum_coef, cmatrix, epsilon = 1e-10) { p <- length(data_new) - 1 X_new <- data_new[1:p] Y_new <- data_new[p + 1] eta <- X_new %*% coef mu <- 1 / (1 + exp(-eta)) cmatrix <- cmatrix + (X_new %o% X_new) * as.numeric((1 - mu) * mu) lik_dev <- as.numeric(-(Y_new - mu)) * X_new coef <- coef - solve(cmatrix + epsilon * diag(1, p), lik_dev) cum_coef <- cum_coef + coef return(list(coef, cum_coef, cmatrix)) } #' Calculate negative log likelihood given data segment and guess of #' coefficient. #' #' @param data Data to be used to calculate the negative log likelihood. #' @param b Guess of the coefficient. #' @keywords internal #' #' @noRd #' @return Negative log likelihood. neg_log_lik_binomial <- function(data, b) { p <- dim(data)[2] - 1 X <- data[, 1:p, drop = FALSE] Y <- data[, p + 1, drop = FALSE] u <- as.numeric(X %*% b) L <- -Y * u + log(1 + exp(u)) return(sum(L)) } #' Find change points using dynamic programming with pruning and SeGD. #' #' @param data Data used to find change points. #' @param beta Penalty coefficient for the number of change points. #' @param B Initial guess on the number of change points. #' @param trim Propotion of the data to ignore the change points at the #' beginning, ending and between change points. #' @keywords internal #' #' @noRd #' @return A list containing potential change point locations and negative log #' likelihood for each segment based on the change points guess. segd_binomial <- function(data, beta, B = 10, trim = 0.025) { n <- dim(data)[1] p <- dim(data)[2] - 1 Fobj <- c(-beta, 0) cp_set <- list(NULL, 0) set <- c(0, 1) # choose the initial values based on pre-segmentation index <- rep(1:B, rep(n / B, B)) coef.int <- matrix(NA, B, p) for (i in 1:B) { out <- fastglm::fastglm( as.matrix(data[index == i, 1:p]), data[index == i, p + 1], family = "binomial" ) coef.int[i, ] <- coef(out) } X1 <- data[1, 1:p] cum_coef <- coef <- matrix(coef.int[1, ], p, 1) e_eta <- exp(coef %*% X1) const <- e_eta / (1 + e_eta)^2 cmatrix <- array((X1 %o% X1) * as.numeric(const), c(p, p, 1)) for (t in 2:n) { m <- length(set) cval <- rep(NA, m) for (i in 1:(m - 1)) { coef_c <- coef[, i] cum_coef_c <- cum_coef[, i] cmatrix_c <- cmatrix[, , i] out <- cost_logistic_update(data[t, ], coef_c, cum_coef_c, cmatrix_c) coef[, i] <- out[[1]] cum_coef[, i] <- out[[2]] cmatrix[, , i] <- out[[3]] k <- set[i] + 1 cval[i] <- 0 if (t - k >= p - 1) { cval[i] <- neg_log_lik_binomial(data[k:t, ], cum_coef[, i] / (t - k + 1)) } } # the choice of initial values requires further investigation cval[m] <- 0 Xt <- data[t, 1:p] cum_coef_add <- coef_add <- coef.int[index[t], ] e_eta_t <- exp(coef_add %*% Xt) const <- e_eta_t / (1 + e_eta_t)^2 cmatrix_add <- (Xt %o% Xt) * as.numeric(const) coef <- cbind(coef, coef_add) cum_coef <- cbind(cum_coef, cum_coef_add) cmatrix <- abind::abind(cmatrix, cmatrix_add, along = 3) # Adding a momentum term (TBD) obj <- cval + Fobj[set + 1] + beta min_val <- min(obj) ind <- which(obj == min_val)[1] cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind]) cp_set <- append(cp_set, list(cp_set_add)) ind2 <- (cval + Fobj[set + 1]) <= min_val set <- c(set[ind2], t) coef <- coef[, ind2, drop = FALSE] cum_coef <- cum_coef[, ind2, drop = FALSE] cmatrix <- cmatrix[, , ind2, drop = FALSE] Fobj <- c(Fobj, min_val) } # Remove change-points close to the boundaries cp <- cp_set[[n + 1]] if (length(cp) > 0) { ind3 <- (seq_len(length(cp)))[(cp < trim * n) | (cp > (1 - trim) * n)] cp <- cp[-ind3] } nLL <- 0 cp_loc <- unique(c(0, cp, n)) for (i in 1:(length(cp_loc) - 1)) { seg <- (cp_loc[i] + 1):cp_loc[i + 1] data_seg <- data[seg, ] out <- fastglm::fastglm( as.matrix(data_seg[, 1:p]), data_seg[, p + 1], family = "binomial" ) nLL <- out$deviance / 2 + nLL
}

output <- list(cp, nLL)
names(output) <- c("cp", "nLL")
return(output)
}
#' Cost function for Poisson regression.
#'
#' @param data Data to be used to calculate the cost values. The last column is
#'     the response variable.
#' @param family Family of the distribution.
#' @keywords internal
#'
#' @noRd
#' @return Cost value for the corresponding segment of data.
cost_glm_poisson <- function(data, family = "poisson") {
data <- as.matrix(data)
p <- dim(data)[2] - 1
out <- fastglm::fastglm(as.matrix(data[, 1:p]), data[, p + 1], family = family)
return(out$deviance / 2) } #' Implementation of vanilla PELT for poisson regression type data. #' #' @param data Data to be used for change point detection. #' @param beta Penalty coefficient for the number of change points. #' @param cost Cost function to be used to calculate cost values. #' @keywords internal #' #' @noRd #' @return A list consisting of two: change point locations and negative log #' likelihood values for each segment. pelt_vanilla_poisson <- function(data, beta, cost = cost_glm_poisson) { n <- dim(data)[1] p <- dim(data)[2] - 1 Fobj <- c(-beta, 0) cp_set <- list(NULL, 0) set <- c(0, 1) for (t in 2:n) { m <- length(set) cval <- rep(NA, m) for (i in 1:m) { k <- set[i] + 1 if (t - k >= p - 1) cval[i] <- suppressWarnings(cost(data[k:t, ])) else cval[i] <- 0 } obj <- cval + Fobj[set + 1] + beta min_val <- min(obj) ind <- which(obj == min_val)[1] cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind]) cp_set <- append(cp_set, list(cp_set_add)) ind2 <- (cval + Fobj[set + 1]) <= min_val set <- c(set[ind2], t) Fobj <- c(Fobj, min_val) # if (t %% 100 == 0) print(t) } cp <- cp_set[[n + 1]] output <- list(cp) names(output) <- c("cp") return(output) } #' Function to update the coefficients using gradient descent. #' #' @param data_new New data point used to update the coeffient. #' @param coef Previous coeffient to be updated. #' @param cum_coef Summation of all the past coefficients to be used in #' averaging. #' @param cmatrix Hessian matrix in gradient descent. #' @param epsilon Small adjustment to avoid singularity when doing inverse on #' the Hessian matrix. #' @param G Upper bound for the coefficient. #' @param L Winsorization lower bound. #' @param H Winsorization upper bound. #' @keywords internal #' #' @noRd #' @return A list of values containing the new coefficients, summation of #' coefficients so far and all the Hessian matrices. cost_poisson_update <- function(data_new, coef, cum_coef, cmatrix, epsilon = 0.001, G = 10^10, L = -20, H = 20) { p <- length(data_new) - 1 X_new <- data_new[1:p] Y_new <- data_new[p + 1] eta <- X_new %*% coef mu <- exp(eta) cmatrix <- cmatrix + (X_new %o% X_new) * min(as.numeric(mu), G) lik_dev <- as.numeric(-(Y_new - mu)) * X_new coef <- coef - solve(cmatrix + epsilon * diag(1, p), lik_dev) coef <- pmin(pmax(coef, L), H) cum_coef <- cum_coef + coef return(list(coef, cum_coef, cmatrix)) } #' Calculate negative log likelihood given data segment and guess of #' coefficient. #' #' @param data Data to be used to calculate the negative log likelihood. #' @param b Guess of the coefficient. #' @keywords internal #' #' @noRd #' @return Negative log likelihood. neg_log_lik_poisson <- function(data, b) { p <- dim(data)[2] - 1 X <- data[, 1:p, drop = FALSE] Y <- data[, p + 1, drop = FALSE] u <- as.numeric(X %*% b) L <- -Y * u + exp(u) + lfactorial(Y) return(sum(L)) } #' Find change points using dynamic programming with pruning and SeGD. #' #' @param data Data used to find change points. #' @param beta Penalty coefficient for the number of change points. #' @param B Initial guess on the number of change points. #' @param trim Propotion of the data to ignore the change points at the #' beginning, ending and between change points. #' @param epsilon Small adjustment to avoid singularity when doing inverse on #' the Hessian matrix. #' @param G Upper bound for the coefficient. #' @param L Winsorization lower bound. #' @param H Winsorization upper bound. #' @keywords internal #' #' @noRd #' @return A list containing potential change point locations and negative log #' likelihood for each segment based on the change points guess. segd_poisson <- function(data, beta, B = 10, trim = 0.03, epsilon = 0.001, G = 10^10, L = -20, H = 20) { n <- dim(data)[1] p <- dim(data)[2] - 1 Fobj <- c(-beta, 0) cp_set <- list(NULL, 0) set <- c(0, 1) # choose the initial values based on pre-segmentation index <- rep(1:B, rep(n / B, B)) coef.int <- matrix(NA, B, p) for (i in 1:B) { out <- fastglm::fastglm(x = as.matrix(data[index == i, 1:p]), y = data[index == i, p + 1], family = "poisson") coef.int[i, ] <- coef(out) } X1 <- data[1, 1:p] cum_coef <- coef <- pmin(pmax(matrix(coef.int[1, ], p, 1), L), H) e_eta <- exp(coef %*% X1) const <- e_eta cmatrix <- array((X1 %o% X1) * as.numeric(const), c(p, p, 1)) for (t in 2:n) { m <- length(set) cval <- rep(NA, m) for (i in 1:(m - 1)) { coef_c <- coef[, i] cum_coef_c <- cum_coef[, i] cmatrix_c <- cmatrix[, , i] out <- cost_poisson_update(data[t, ], coef_c, cum_coef_c, cmatrix_c, epsilon = epsilon, G = G, L = L, H = H) coef[, i] <- out[[1]] cum_coef[, i] <- out[[2]] cmatrix[, , i] <- out[[3]] k <- set[i] + 1 cum_coef_win <- pmin(pmax(cum_coef[, i] / (t - k + 1), L), H) if (t - k >= p - 1) cval[i] <- neg_log_lik_poisson(data[k:t, ], cum_coef_win) else cval[i] <- 0 } # the choice of initial values requires further investigation cval[m] <- 0 Xt <- data[t, 1:p] cum_coef_add <- coef_add <- pmin(pmax(coef.int[index[t], ], L), H) e_eta_t <- exp(coef_add %*% Xt) const <- e_eta_t cmatrix_add <- (Xt %o% Xt) * as.numeric(const) coef <- cbind(coef, coef_add) cum_coef <- cbind(cum_coef, cum_coef_add) cmatrix <- abind::abind(cmatrix, cmatrix_add, along = 3) # Adding a momentum term (TBD) obj <- cval + Fobj[set + 1] + beta min_val <- min(obj) ind <- which(obj == min_val)[1] cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind]) cp_set <- append(cp_set, list(cp_set_add)) ind2 <- (cval + Fobj[set + 1]) <= min_val set <- c(set[ind2], t) coef <- coef[, ind2, drop = FALSE] cum_coef <- cum_coef[, ind2, drop = FALSE] cmatrix <- cmatrix[, , ind2, drop = FALSE] Fobj <- c(Fobj, min_val) } # Remove change-points close to the boundaries cp <- cp_set[[n + 1]] if (length(cp) > 0) { ind3 <- (1:length(cp))[(cp < trim * n) | (cp > (1 - trim) * n)] if (length(ind3) > 0) cp <- cp[-ind3] } cp <- sort(unique(c(0, cp))) index <- which((diff(cp) < trim * n) == TRUE) if (length(index) > 0) cp <- floor((cp[-(index + 1)] + cp[-index]) / 2) cp <- cp[cp > 0] # nLL <- 0 # cp_loc <- unique(c(0,cp,n)) # for(i in 1:(length(cp_loc)-1)) # { # seg <- (cp_loc[i]+1):cp_loc[i+1] # data_seg <- data[seg,] # out <- fastglm(as.matrix(data_seg[, 1:p]), data_seg[, p+1], family="Poisson") # nLL <- out$deviance/2 + nLL
# }

# output <- list(cp, nLL)
# names(output) <- c("cp", "nLL")

output <- list(cp)
names(output) <- c("cp")

return(output)
}

# Generate data from poisson regression models with change-points
#' @param n Number of observations.
#' @param d Dimension of the covariates.
#' @param true.coef True regression coefficients.
#' @param true.cp.loc True change-point locations.
#' @param Sigma Covariance matrix of the covariates.
#' @keywords internal
#'
#' @noRd
#' @return A list containing the generated data and the true cluster
#'    assignments.
data_gen_poisson <- function(n, d, true.coef, true.cp.loc, Sigma) {
loc <- unique(c(0, true.cp.loc, n))
if (dim(true.coef)[2] != length(loc) - 1) stop("true.coef and true.cp.loc do not match")
x <- mvtnorm::rmvnorm(n, mean = rep(0, d), sigma = Sigma)
y <- NULL
for (i in 1:(length(loc) - 1))
{
mu <- exp(x[(loc[i] + 1):loc[i + 1], , drop = FALSE] %*% true.coef[, i, drop = FALSE])
group <- rpois(length(mu), mu)
y <- c(y, group)
}
data <- cbind(x, y)
true_cluster <- rep(1:(length(loc) - 1), diff(loc))
result <- list(data, true_cluster)
return(result)
}
#' Cost function for penalized linear regression.
#'
#' @param data Data to be used to calculate the cost values. The last column is
#'     the response variable.
#' @param lambda Penalty coefficient.
#' @param family Family of the distribution.
#' @keywords internal
#'
#' @noRd
#' @return Cost value for the corresponding segment of data.
cost_lasso <- function(data, lambda, family = "gaussian") {
data <- as.matrix(data)
n <- dim(data)[1]
p <- dim(data)[2] - 1
out <- glmnet::glmnet(as.matrix(data[, 1:p]), data[, p + 1], family = family, lambda = lambda)
return(deviance(out) / 2)
}

#' Implementation of vanilla PELT for penalized linear regression type data.
#'
#' @param data Data to be used for change point detection.
#' @param beta Penalty coefficient for the number of change points.
#' @param B Initial guess on the number of change points.
#' @param cost Cost function to be used to calculate cost values.
#' @param family Family of the distribution.
#' @keywords internal
#'
#' @noRd
#' @return A list consisting of two: change point locations and negative log
#'     likelihood values for each segment.
pelt_vanilla_lasso <- function(data, beta, B = 10, cost = cost_lasso, family = "gaussian") {
n <- dim(data)[1]
p <- dim(data)[2] - 1
index <- rep(1:B, rep(n / B, B))
err_sd <- act_num <- rep(NA, B)
for (i in 1:B)
{
cvfit <- glmnet::cv.glmnet(as.matrix(data[index == i, 1:p]), data[index == i, p + 1], family = family)
coef <- coef(cvfit, s = "lambda.1se")[-1]
resi <- data[index == i, p + 1] - as.matrix(data[index == i, 1:p]) %*% as.numeric(coef)
err_sd[i] <- sqrt(mean(resi^2))
act_num[i] <- sum(abs(coef) > 0)
}
err_sd_mean <- mean(err_sd) # only works if error sd is unchanged.
act_num_mean <- mean(act_num)
beta <- (act_num_mean + 1) * beta # seems to work but there might be better choices

Fobj <- c(-beta, 0)
cp_set <- list(NULL, 0)
set <- c(0, 1)
for (t in 2:n)
{
m <- length(set)
cval <- rep(NA, m)
for (i in 1:m)
{
k <- set[i] + 1
if (t - k >= 1) cval[i] <- suppressWarnings(cost(data[k:t, ], lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1)))) else cval[i] <- 0
}
obj <- cval + Fobj[set + 1] + beta
min_val <- min(obj)
ind <- which(obj == min_val)[1]
cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
ind2 <- (cval + Fobj[set + 1]) <= min_val
set <- c(set[ind2], t)
Fobj <- c(Fobj, min_val)
if (t %% 100 == 0) print(t)
}
cp <- cp_set[[n + 1]]
# nLL <- 0
# cp_loc <- unique(c(0,cp,n))
# for(i in 1:(length(cp_loc)-1))
# {
#  seg <- (cp_loc[i]+1):cp_loc[i+1]
#  data_seg <- data[seg,]
#  out <- glmnet(as.matrix(data_seg[, 1:p]), data_seg[, p+1], lambda=lambda, family=family)
#  nLL <- deviance(out)/2 + nLL
# }
# output <- list(cp, nLL)
output <- list(cp)
names(output) <- c("cp")
return(output)
}

#' Function to update the coefficients using gradient descent.
#' @param a Coefficient to be updated.
#' @param lambda Penalty coefficient.
#' @keywords internal
#'
#' @noRd
#' @return Updated coefficient.
soft_threshold <- function(a, lambda) {
sign(a) * pmax(abs(a) - lambda, 0)
}

#' Function to update the coefficients using gradient descent.
#'
#' @param data_new New data point used to update the coeffient.
#' @param coef Previous coeffient to be updated.
#' @param cum_coef Summation of all the past coefficients to be used in
#'     averaging.
#' @param cmatrix Hessian matrix in gradient descent.
#' @param lambda Penalty coefficient.
#' @keywords internal
#'
#' @noRd
#' @return A list of values containing the new coefficients, summation of
#'     coefficients so far and all the Hessian matrices.
cost_lasso_update <- function(data_new, coef, cum_coef, cmatrix, lambda) {
p <- length(data_new) - 1
X_new <- data_new[1:p]
Y_new <- data_new[p + 1]
mu <- X_new %*% coef
cmatrix <- cmatrix + X_new %o% X_new
# B <- as.vector(cmatrix_inv%*%X_new)
# cmatrix_inv <- cmatrix_inv - B%o%B/(1+sum(X_new*B))
lik_dev <- as.numeric(-(Y_new - mu)) * X_new
coef <- coef - solve(cmatrix, lik_dev)
nc <- norm(cmatrix, type = "F") # the choice of norm affects the speed. Spectral norm is more accurate but slower than F norm.
coef <- soft_threshold(coef, lambda / nc)
cum_coef <- cum_coef + coef
return(list(coef, cum_coef, cmatrix))
}

#' Calculate negative log likelihood given data segment and guess of
#' coefficient.
#'
#' @param data Data to be used to calculate the negative log likelihood.
#' @param b Guess of the coefficient.
#' @param lambda Penalty coefficient.
#' @keywords internal
#'
#' @noRd
#' @return Negative log likelihood.
neg_log_lik_lasso <- function(data, b, lambda) {
p <- dim(data)[2] - 1
X <- data[, 1:p, drop = FALSE]
Y <- data[, p + 1, drop = FALSE]
resi <- Y - X %*% b
L <- sum(resi^2) / 2 + lambda * sum(abs(b))
return(L)
}

#' Find change points using dynamic programming with pruning and SeGD.
#'
#' @param data Data used to find change points.
#' @param beta Penalty coefficient for the number of change points.
#' @param B Initial guess on the number of change points.
#' @param trim Propotion of the data to ignore the change points at the
#'     beginning, ending and between change points.
#' @param epsilon Small adjustment to avoid singularity when doing inverse on
#'    the Hessian matrix.
#' @param family Family of the distribution.
#' @keywords internal
#'
#' @noRd
#' @return A list containing potential change point locations and negative log
#'     likelihood for each segment based on the change points guess.
segd_lasso <- function(data, beta, B = 10, trim = 0.025, epsilon = 1e-5, family = "gaussian") {
n <- dim(data)[1]
p <- dim(data)[2] - 1
Fobj <- c(-beta, 0)
cp_set <- list(NULL, 0)
set <- c(0, 1)

# choose the initial values based on pre-segmentation

index <- rep(1:B, rep(n / B, B))
coef.int <- matrix(NA, B, p)
err_sd <- act_num <- rep(NA, B)
for (i in 1:B)
{
cvfit <- glmnet::cv.glmnet(as.matrix(data[index == i, 1:p]), data[index == i, p + 1], family = family)
coef.int[i, ] <- coef(cvfit, s = "lambda.1se")[-1]
resi <- data[index == i, p + 1] - as.matrix(data[index == i, 1:p]) %*% as.numeric(coef.int[i, ])
err_sd[i] <- sqrt(mean(resi^2))
act_num[i] <- sum(abs(coef.int[i, ]) > 0)
}
err_sd_mean <- mean(err_sd) # only works if error sd is unchanged.
act_num_mean <- mean(act_num)
beta <- (act_num_mean + 1) * beta # seems to work but there might be better choices

X1 <- data[1, 1:p]
cum_coef <- coef <- matrix(coef.int[1, ], p, 1)
eta <- coef %*% X1
# c_int <- diag(1/epsilon,p) - X1%o%X1/epsilon^2/(1+sum(X1^2)/epsilon)
# cmatrix_inv <- array(c_int, c(p,p,1))
cmatrix <- array(X1 %o% X1 + epsilon * diag(1, p), c(p, p, 1))

for (t in 2:n)
{
m <- length(set)
cval <- rep(NA, m)

for (i in 1:(m - 1))
{
coef_c <- coef[, i]
cum_coef_c <- cum_coef[, i]
# cmatrix_inv_c <- cmatrix_inv[,,i]
cmatrix_c <- cmatrix[, , i]
k <- set[i] + 1
out <- cost_lasso_update(data[t, ], coef_c, cum_coef_c, cmatrix_c, lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1)))
coef[, i] <- out[[1]]
cum_coef[, i] <- out[[2]]
# cmatrix_inv[,,i] <- out[[3]]
cmatrix[, , i] <- out[[3]]
if (t - k >= 2) cval[i] <- neg_log_lik_lasso(data[k:t, ], cum_coef[, i] / (t - k + 1), lambda = err_sd_mean * sqrt(2 * log(p) / (t - k + 1))) else cval[i] <- 0
}

# the choice of initial values requires further investigation

cval[m] <- 0
Xt <- data[t, 1:p]
# cmatrix_inv_add <- diag(1/epsilon,p) - Xt%o%Xt/epsilon^2/(1+sum(Xt^2)/epsilon)

# cmatrix_inv <- abind::abind(cmatrix_inv, cmatrix_inv_add, along=3)
cmatrix <- abind::abind(cmatrix, Xt %o% Xt + epsilon * diag(1, p), along = 3)

obj <- cval + Fobj[set + 1] + beta
min_val <- min(obj)
ind <- which(obj == min_val)[1]
cp_set_add <- c(cp_set[[set[ind] + 1]], set[ind])
ind2 <- (cval + Fobj[set + 1]) <= min_val
set <- c(set[ind2], t)
coef <- coef[, ind2, drop = FALSE]
cum_coef <- cum_coef[, ind2, drop = FALSE]
cmatrix <- cmatrix[, , ind2, drop = FALSE]
# cmatrix_inv <- cmatrix_inv[,,ind2,drop=FALSE]
Fobj <- c(Fobj, min_val)
}

# Remove change-points close to the boundaries and merge change-points

cp <- cp_set[[n + 1]]
if (length(cp) > 0) {
ind3 <- (1:length(cp))[(cp < trim * n) | (cp > (1 - trim) * n)]
if (length(ind3) > 0) cp <- cp[-ind3]
}

cp <- sort(unique(c(0, cp)))
index <- which((diff(cp) < trim * n) == TRUE)
if (length(index) > 0) cp <- floor((cp[-(index + 1)] + cp[-index]) / 2)
cp <- cp[cp > 0]

# nLL <- 0
# cp_loc <- unique(c(0,cp,n))
# for(i in 1:(length(cp_loc)-1))
# {
#  seg <- (cp_loc[i]+1):cp_loc[i+1]
#  data_seg <- data[seg,]
#  out <- fastglm(as.matrix(data_seg[, 1:p]), data_seg[, p+1], family="binomial")
#  nLL <- out$deviance/2 + nLL # } output <- list(cp) names(output) <- c("cp") return(output) } # Generate data from penalized linear regression models with change-points #' @param n Number of observations. #' @param d Dimension of the covariates. #' @param true.coef True regression coefficients. #' @param true.cp.loc True change-point locations. #' @param Sigma Covariance matrix of the covariates. #' @param evar Error variance. #' @keywords internal #' #' @noRd #' @return A list containing the generated data and the true cluster #' assignments. data_gen_lasso <- function(n, d, true.coef, true.cp.loc, Sigma, evar) { loc <- unique(c(0, true.cp.loc, n)) if (dim(true.coef)[2] != length(loc) - 1) stop("true.coef and true.cp.loc do not match") x <- mvtnorm::rmvnorm(n, mean = rep(0, d), sigma = Sigma) y <- NULL for (i in 1:(length(loc) - 1)) { Xb <- x[(loc[i] + 1):loc[i + 1], , drop = FALSE] %*% true.coef[, i, drop = FALSE] add <- Xb + rnorm(length(Xb), sd = sqrt(evar)) y <- c(y, add) } data <- cbind(x, y) true_cluster <- rep(1:(length(loc) - 1), diff(loc)) result <- list(data, true_cluster) return(result) } set.seed(1) p <- 5 x <- matrix(rnorm(300 * p, 0, 1), ncol = p) # Randomly generate coefficients with different means. theta <- rbind(rnorm(p, 0, 1), rnorm(p, 2, 1)) # Randomly generate response variables based on the segmented data and # corresponding coefficients y <- c( rbinom(125, 1, 1 / (1 + exp(-x[1:125, ] %*% theta[1, ]))), rbinom(300 - 125, 1, 1 / (1 + exp(-x[(125 + 1):300, ] %*% theta[2, ]))) ) segd_binomial(cbind(x, y), (p + 1) * log(300) / 2, B = 5)$cp
#> [1] 125

fastcpd.binomial(
cbind(y, x),
segment_count = 5,
beta = "BIC",
r.progress = FALSE
)@cp_set
#> [1] 125

pelt_vanilla_binomial(cbind(x, y), (p + 1) * log(300) / 2)$cp #> [1] 0 125 fastcpd.binomial( cbind(y, x), segment_count = 5, vanilla_percentage = 1, beta = "BIC", cost_adjustment = "BIC", r.progress = FALSE )@cp_set #> [1] 125 set.seed(1) n <- 1500 d <- 5 rho <- 0.9 Sigma <- array(0, c(d, d)) for (i in 1:d) { Sigma[i, ] <- rho^(abs(i - (1:d))) } delta <- c(5, 7, 9, 11, 13) a.sq <- 1 delta.new <- delta * sqrt(a.sq) / sqrt(as.numeric(t(delta) %*% Sigma %*% delta)) true.cp.loc <- c(375, 750, 1125) # regression coefficients true.coef <- matrix(0, nrow = d, ncol = length(true.cp.loc) + 1) true.coef[, 1] <- c(1, 1.2, -1, 0.5, -2) true.coef[, 2] <- true.coef[, 1] + delta.new true.coef[, 3] <- true.coef[, 1] true.coef[, 4] <- true.coef[, 3] - delta.new out <- data_gen_poisson(n, d, true.coef, true.cp.loc, Sigma) data <- out[[1]] g_tr <- out[[2]] beta <- log(n) * (d + 1) / 2 segd_poisson( data, beta, trim = 0.03, B = 10, epsilon = 0.001, G = 10^10, L = -20, H = 20 )$cp
#> [1]  380  751 1136 1251

fastcpd.poisson(
cbind(data[, d + 1], data[, 1:d]),
beta = beta,
epsilon = 0.001,
segment_count = 10,
r.progress = FALSE
)@cp_set
#> [1]  380  751 1136 1251

pelt_vanilla_poisson(data, beta)$cp #> [1] 0 374 752 1133 fastcpd.poisson( cbind(data[, d + 1], data[, 1:d]), segment_count = 10, vanilla_percentage = 1, beta = beta, cost_adjustment = "BIC", r.progress = FALSE )@cp_set #> [1] 374 752 1133 set.seed(1) n <- 1000 s <- 3 d <- 50 evar <- 0.5 Sigma <- diag(1, d) true.cp.loc <- c(100, 300, 500, 800, 900) seg <- length(true.cp.loc) + 1 true.coef <- matrix(rnorm(seg * s), s, seg) true.coef <- rbind(true.coef, matrix(0, d - s, seg)) out <- data_gen_lasso(n, d, true.coef, true.cp.loc, Sigma, evar) data <- out[[1]] beta <- log(n) / 2 # beta here has different meaning segd_lasso(data, beta, B = 10, trim = 0.025)$cp
#> [1] 100 300 520 800 901

fastcpd.lasso(
cbind(data[, d + 1], data[, 1:d]),
epsilon = 1e-5,
beta = beta,
pruning_coef = 0,
r.progress = FALSE
)@cp_set
#> [1] 100 300 520 800 901

pelt_vanilla_lasso(data, beta, cost = cost_lasso)\$cp
#> [1] 100
#> [1] 200
#> [1] 300
#> [1] 400
#> [1] 500
#> [1] 600
#> [1] 700
#> [1] 800
#> [1] 900
#> [1] 1000
#> [1]   0 103 299 510 800 900

fastcpd.lasso(
cbind(data[, d + 1], data[, 1:d]),
vanilla_percentage = 1,
epsilon = 1e-5,
beta = beta,
#> [1] 103 299 510 800 900