# Copyright (C) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE in project root for information.

#' Spark ML -- PageSplitter
#'
#' Creates a \code{com.microsoft.ml.spark.PageSplitter} stage, applies the
#' supplied parameters to it and, unless \code{unfit.model} is \code{TRUE},
#' uses it to transform \code{x}.
#'
#' @param x The data to transform (anything accepted by \code{spark_dataframe()}); when \code{unfit.model} is \code{TRUE}, the Spark connection instead
#' @param boundaryRegex how to split into words
#' @param inputCol The name of the input column
#' @param maximumPageLength the maximum number of characters to be in a page
#' @param minimumPageLength the minimum number of characters to have on a page in order to preserve word boundaries
#' @param outputCol The name of the output column
#' @param unfit.model If \code{TRUE}, treat \code{x} as the Spark connection and return the parameterized stage without transforming any data
#' @return The transformed data registered through \code{sdf_register()}, or the parameterized stage object when \code{unfit.model} is \code{TRUE}
#' @export
ml_page_splitter <- function(x, boundaryRegex = "\\s", inputCol = NULL,
                             maximumPageLength = 5000,
                             minimumPageLength = 4500, outputCol = NULL,
                             unfit.model = FALSE) {
  # Without data to transform, `x` itself is the connection; otherwise the
  # connection is derived from the data being transformed.
  if (unfit.model) {
    sc <- x
  } else {
    df <- spark_dataframe(x)
    sc <- spark_connection(df)
  }

  mod <- invoke_new(sc, "com.microsoft.ml.spark.PageSplitter")

  # The page-length setters are given integers, so coerce R doubles first.
  mod_parameterized <- mod %>%
    invoke("setBoundaryRegex", boundaryRegex) %>%
    invoke("setInputCol", inputCol) %>%
    invoke("setMaximumPageLength", as.integer(maximumPageLength)) %>%
    invoke("setMinimumPageLength", as.integer(minimumPageLength)) %>%
    invoke("setOutputCol", outputCol)

  # No data frame exists in this mode, so hand back the configured stage
  # instead of attempting a transform.
  if (unfit.model) {
    return(mod_parameterized)
  }

  transformed <- invoke(mod_parameterized, "transform", df)

  sdf_register(transformed)
}
