# Copyright (C) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE in project root for information.

#' Spark ML -- CleanMissingData
#'
#' Replaces missing values in the input dataset by fitting the
#' \code{com.microsoft.ml.spark.CleanMissingData} stage on \code{x} and,
#' unless \code{only.model} is \code{TRUE}, applying the fitted model to the
#' same data.
#'
#' The following modes are supported:
#'
#' \itemize{
#'   \item Mean   - replaces missings with the mean of fit column
#'   \item Median - replaces missings with approximate median of fit column
#'   \item Custom - replaces missings with custom value specified by user
#' }
#'
#' For mean and median modes, only numeric column types are supported,
#' specifically:
#'
#' \itemize{
#'   \item int
#'   \item long
#'   \item float
#'   \item double
#' }
#'
#' For custom mode, the types above are supported and additionally:
#'
#' \itemize{
#'   \item str
#'   \item bool
#' }
#'
#' @param x An object that \code{spark_dataframe()} can resolve to a Spark
#'   DataFrame; it is used both to fit the stage and as the data to transform.
#' @param cleaningMode Cleaning mode (one of the modes listed above)
#' @param customValue Custom value for replacement. When \code{NULL} (the
#'   default) the \code{setCustomValue} setter is not called at all.
#' @param inputCols The names of the input columns. Must be supplied.
#' @param outputCols The names of the output columns. Must be supplied.
#' @param only.model If \code{TRUE}, return the fitted model object without
#'   transforming \code{x}.
#' @return When \code{only.model} is \code{TRUE}, the model object built by
#'   \code{sparklyr:::new_ml_model()}; otherwise the transformed data as
#'   returned by \code{sdf_register()}.
#' @export
ml_clean_missing_data <- function(x, cleaningMode = "Mean", customValue = NULL,
                                  inputCols = NULL, outputCols = NULL,
                                  only.model = FALSE) {
  # The column arguments default to NULL only for signature compatibility;
  # the stage cannot be configured without them. Fail early with a readable
  # message instead of a low-level error from as.array() or the JVM.
  if (is.null(inputCols)) {
    stop("`inputCols` must be supplied (got NULL).", call. = FALSE)
  }
  if (is.null(outputCols)) {
    stop("`outputCols` must be supplied (got NULL).", call. = FALSE)
  }

  df <- spark_dataframe(x)
  sc <- spark_connection(df)

  stage <- invoke_new(sc, "com.microsoft.ml.spark.CleanMissingData")
  stage <- invoke(stage, "setCleaningMode", cleaningMode)

  # Only forward a custom value the caller actually provided, so that a null
  # is never pushed through the JVM setter.
  if (!is.null(customValue)) {
    stage <- invoke(stage, "setCustomValue", customValue)
  }

  # as.array() makes a length-one character vector reach the JVM as an array
  # rather than as a scalar string.
  stage <- invoke(stage, "setInputCols", as.array(inputCols))
  stage <- invoke(stage, "setOutputCols", as.array(outputCols))

  fitted <- invoke(stage, "fit", df)

  mod_model <- sparklyr:::new_ml_model(stage, fitted, fitted)

  if (only.model) {
    return(mod_model)
  }

  transformed <- invoke(mod_model$model, "transform", df)

  sdf_register(transformed)
}
