% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/imputer_linear_regression.R
\name{impute_with_linear_regression}
\alias{impute_with_linear_regression}
\title{Linear Regression Imputation Function}
\usage{
impute_with_linear_regression(
  sc,
  sdf,
  target_col,
  feature_cols,
  elastic_net_param = 0,
  target_col_prev
)
}
\arguments{
\item{sc}{A Spark connection}

\item{sdf}{A Spark DataFrame}

\item{target_col}{The column with missing values to impute}

\item{feature_cols}{The columns to use as features in the linear regression model. These columns should not have missing values.}

\item{elastic_net_param}{The elastic net parameter for the linear regression model. Default is 0 (ridge regression, i.e. a pure L2 penalty).}

\item{target_col_prev}{The target column from the previous iteration, used to calculate residuals.}
}
\value{
The Spark DataFrame with missing values imputed in the target column
}
\description{
This function imputes missing values in a Spark DataFrame using linear regression.
}
\examples{
# This example is not executed since it needs additional software (Apache Spark)
\dontrun{
# Load the required packages
library(bigMICE)
library(sparklyr)
library(dplyr)

# Connect to Spark
# Assumes that you have already installed Spark with sparklyr::spark_install()
sc <- spark_connect(master = "local")

# Create sample data with some missing values in 'age'
sample_data <- data.frame(
 age = c(25, NA, 35, NA, 45, 30),
 income = c(50000, 60000, 70000, 55000, 80000, 52000),
 education_years = c(16, 18, 20, 17, 22, 16),
 experience = c(3, 8, 12, 5, 18, 7)
)

# Copy to Spark DataFrame
sdf <- copy_to(sc, sample_data, "sample_data")

# Create previous iteration data (for residual calculation)
# In practice, this would be from a previous imputation step
sdf_prev <- sdf \%>\%
  mutate(age = ifelse(is.na(age), 30, age)) \%>\%  # Simple initial imputation
  select(age)

# Impute missing age values using income, education_years, and experience
imputed_sdf <- impute_with_linear_regression(
  sc = sc,
  sdf = sdf,
  target_col = "age",
  feature_cols = c("income", "education_years", "experience"),
  elastic_net_param = 0,  # Ridge regression
  target_col_prev = sdf_prev
)

# View results
imputed_sdf \%>\% collect()

# Clean up
spark_disconnect(sc)
}
}
