Source code for EnsembleByKey

# Copyright (C) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE in project root for information.


import sys
# Python 3 has no ``basestring``; alias it to ``str`` so code written against
# the Python 2 name keeps working.  Compare the numeric major version from
# ``sys.version_info`` rather than the ``sys.version`` string: string comparison
# is lexicographic, so e.g. "10.0" would sort before "3".
if sys.version_info[0] >= 3:
    basestring = str

from pyspark.ml.param.shared import *
from pyspark import keyword_only
from pyspark.ml.util import JavaMLReadable, JavaMLWritable
from pyspark.ml.wrapper import JavaTransformer, JavaEstimator, JavaModel
from pyspark.ml.common import inherit_doc
from mmlspark.Utils import *

@inherit_doc
class EnsembleByKey(ComplexParamsMixin, JavaMLReadable, JavaMLWritable, JavaTransformer):
    """
    The ``EnsembleByKey`` first performs a grouping operation on a set of keys,
    and then averages the selected columns. It can handle scalar or vector columns,
    and the dimensions of the vector columns are automatically inferred by materializing
    the first row of the column. To avoid materialization you can provide the vector dimensions
    through the ``setVectorDims`` function, which takes a mapping from
    columns (String) to dimension (Int). You can also choose to squash or keep the original
    dataset with the ``collapseGroup`` parameter.

    Args:

        colNames (list): Names of the result of each col
        collapseGroup (bool): Whether to collapse all items in group to one entry (default: true)
        cols (list): Cols to ensemble
        keys (list): Keys to group by
        strategy (str): How to ensemble the scores, ex: mean (default: mean)
        vectorDims (dict): the dimensions of any vector columns, used to avoid materialization
    """

    @keyword_only
    def __init__(self, colNames=None, collapseGroup=True, cols=None, keys=None, strategy="mean", vectorDims=None):
        """
        Create the backing Java object, declare the params and apply the
        keyword arguments given by the caller.

        Arguments must be passed by keyword (``@keyword_only``). See the class
        docstring for the meaning of each argument.
        """
        super(EnsembleByKey, self).__init__()
        self._java_obj = self._new_java_obj("com.microsoft.ml.spark.EnsembleByKey")

        # Param declarations; defaults are registered right after the param
        # they belong to.
        self.colNames = Param(self, "colNames", "colNames: Names of the result of each col")
        self.collapseGroup = Param(self, "collapseGroup", "collapseGroup: Whether to collapse all items in group to one entry (default: true)")
        self._setDefault(collapseGroup=True)
        self.cols = Param(self, "cols", "cols: Cols to ensemble")
        self.keys = Param(self, "keys", "keys: Keys to group by")
        self.strategy = Param(self, "strategy", "strategy: How to ensemble the scores, ex: mean (default: mean)")
        self._setDefault(strategy="mean")
        self.vectorDims = Param(self, "vectorDims", "vectorDims: the dimensions of any vector columns, used to avoid materialization")

        # NOTE(review): depending on the PySpark version, ``keyword_only``
        # appears to stash the caller's kwargs either on the instance or on
        # the decorated function; both locations are supported here.
        if hasattr(self, "_input_kwargs"):
            kwargs = self._input_kwargs
        else:
            kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, colNames=None, collapseGroup=True, cols=None, keys=None, strategy="mean", vectorDims=None):
        """
        Set the (keyword only) parameters

        Args:

            colNames (list): Names of the result of each col
            collapseGroup (bool): Whether to collapse all items in group to one entry (default: true)
            cols (list): Cols to ensemble
            keys (list): Keys to group by
            strategy (str): How to ensemble the scores, ex: mean (default: mean)
            vectorDims (dict): the dimensions of any vector columns, used to avoid materialization

        Returns:

            The result of ``self._set`` applied to the supplied keyword arguments.
        """
        if hasattr(self, "_input_kwargs"):
            kwargs = self._input_kwargs
        else:
            # Read the kwargs captured for *this* method.  Looking them up on
            # ``__init__`` would re-apply the constructor's arguments instead
            # of the ones passed to ``setParams``.
            kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setColNames(self, value):
        """
        Set the ``colNames`` param.

        Args:

            value (list): Names of the result of each col

        Returns:

            EnsembleByKey: this instance, so that calls can be chained
        """
        self._set(colNames=value)
        return self

    def getColNames(self):
        """

        Returns:

            list: Names of the result of each col
        """
        return self.getOrDefault(self.colNames)

    def setCollapseGroup(self, value):
        """
        Set the ``collapseGroup`` param.

        Args:

            value (bool): Whether to collapse all items in group to one entry (default: true)

        Returns:

            EnsembleByKey: this instance, so that calls can be chained
        """
        self._set(collapseGroup=value)
        return self

    def getCollapseGroup(self):
        """

        Returns:

            bool: Whether to collapse all items in group to one entry (default: true)
        """
        return self.getOrDefault(self.collapseGroup)

    def setCols(self, value):
        """
        Set the ``cols`` param.

        Args:

            value (list): Cols to ensemble

        Returns:

            EnsembleByKey: this instance, so that calls can be chained
        """
        self._set(cols=value)
        return self

    def getCols(self):
        """

        Returns:

            list: Cols to ensemble
        """
        return self.getOrDefault(self.cols)

    def setKeys(self, value):
        """
        Set the ``keys`` param.

        Args:

            value (list): Keys to group by

        Returns:

            EnsembleByKey: this instance, so that calls can be chained
        """
        self._set(keys=value)
        return self

    def getKeys(self):
        """

        Returns:

            list: Keys to group by
        """
        return self.getOrDefault(self.keys)

    def setStrategy(self, value):
        """
        Set the ``strategy`` param.

        Args:

            value (str): How to ensemble the scores, ex: mean (default: mean)

        Returns:

            EnsembleByKey: this instance, so that calls can be chained
        """
        self._set(strategy=value)
        return self

    def getStrategy(self):
        """

        Returns:

            str: How to ensemble the scores, ex: mean (default: mean)
        """
        return self.getOrDefault(self.strategy)

    def setVectorDims(self, value):
        """
        Set the ``vectorDims`` param.

        Args:

            value (dict): the dimensions of any vector columns, used to avoid materialization

        Returns:

            EnsembleByKey: this instance, so that calls can be chained
        """
        self._set(vectorDims=value)
        return self

    def getVectorDims(self):
        """

        Returns:

            dict: the dimensions of any vector columns, used to avoid materialization
        """
        return self.getOrDefault(self.vectorDims)

    @classmethod
    def read(cls):
        """ Returns an MLReader instance for this class. """
        return JavaMMLReader(cls)

    @staticmethod
    def getJavaPackage():
        """ Returns package name String. """
        return "com.microsoft.ml.spark.EnsembleByKey"

    @staticmethod
    def _from_java(java_stage):
        """
        Build the Python wrapper for ``java_stage``.

        The target module path is this class's module with its last dotted
        component replaced by ``EnsembleByKey``; the conversion itself is
        delegated to ``from_java``.
        """
        module_name = EnsembleByKey.__module__
        module_name = module_name.rsplit(".", 1)[0] + ".EnsembleByKey"
        return from_java(java_stage, module_name)