Source code for LightGBMClassifier

# Copyright (C) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE in project root for information.


import sys
# Python 3 has no ``basestring`` builtin, so alias it to ``str`` there.
# The check uses the numeric ``sys.version_info`` rather than the
# ``sys.version`` string: string comparison is lexicographic, so a
# two-digit major version ("10...") would sort below "3" and skip the alias.
if sys.version_info[0] >= 3:
    basestring = str

from pyspark.ml.param.shared import *
from pyspark import keyword_only
from pyspark.ml.util import JavaMLReadable, JavaMLWritable
from pyspark.ml.wrapper import JavaTransformer, JavaEstimator, JavaModel
from pyspark.ml.common import inherit_doc
from mmlspark.Utils import *

@inherit_doc
class LightGBMClassifier(ComplexParamsMixin, JavaMLReadable, JavaMLWritable, JavaEstimator):
    """
    Trains a LightGBM Binary Classification model, a fast, distributed,
    high performance gradient boosting framework based on decision tree
    algorithms.

    For more information please see here: https://github.com/Microsoft/LightGBM.

    Args:
        baggingFraction (double): Bagging fraction (default: 1.0)
        baggingFreq (int): Bagging frequency (default: 0)
        baggingSeed (int): Bagging seed (default: 3)
        defaultListenPort (int): The default listen port on executors, used for testing (default: 12400)
        featureFraction (double): Feature fraction (default: 1.0)
        featuresCol (str): features column name (default: features)
        labelCol (str): label column name (default: label)
        learningRate (double): Learning rate or shrinkage rate (default: 0.1)
        maxBin (int): Max bin (default: 255)
        maxDepth (int): Max depth (default: -1)
        minSumHessianInLeaf (double): minimal sum hessian in one leaf (default: 0.001)
        numIterations (int): Number of iterations, LightGBM constructs num_class * num_iterations trees (default: 100)
        numLeaves (int): Number of leaves (default: 31)
        parallelism (str): Tree learner parallelism, can be set to data_parallel or voting_parallel (default: data_parallel)
        predictionCol (str): prediction column name (default: prediction)
        probabilityCol (str): Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities (default: probability)
        rawPredictionCol (str): raw prediction (a.k.a. confidence) column name (default: rawPrediction)
        thresholds (object): Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0 excepting that at most one value may be 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold
    """

    @keyword_only
    def __init__(self, baggingFraction=1.0, baggingFreq=0, baggingSeed=3, defaultListenPort=12400,
                 featureFraction=1.0, featuresCol="features", labelCol="label", learningRate=0.1,
                 maxBin=255, maxDepth=-1, minSumHessianInLeaf=0.001, numIterations=100,
                 numLeaves=31, parallelism="data_parallel", predictionCol="prediction",
                 probabilityCol="probability", rawPredictionCol="rawPrediction", thresholds=None):
        """
        Create the backing Java estimator, declare every param with its
        default, then apply the keyword arguments the caller supplied.

        All arguments are keyword-only (enforced by ``keyword_only``); see
        the class docstring for their meaning.
        """
        super(LightGBMClassifier, self).__init__()
        self._java_obj = self._new_java_obj("com.microsoft.ml.spark.LightGBMClassifier")

        # Params are declared per instance, each followed by its default.
        self.baggingFraction = Param(
            self, "baggingFraction",
            "baggingFraction: Bagging fraction (default: 1.0)")
        self._setDefault(baggingFraction=1.0)
        self.baggingFreq = Param(
            self, "baggingFreq",
            "baggingFreq: Bagging frequence (default: 0)")
        self._setDefault(baggingFreq=0)
        self.baggingSeed = Param(
            self, "baggingSeed",
            "baggingSeed: Bagging seed (default: 3)")
        self._setDefault(baggingSeed=3)
        self.defaultListenPort = Param(
            self, "defaultListenPort",
            "defaultListenPort: The default listen port on executors, used for testing (default: 12400)")
        self._setDefault(defaultListenPort=12400)
        self.featureFraction = Param(
            self, "featureFraction",
            "featureFraction: Feature fraction (default: 1.0)")
        self._setDefault(featureFraction=1.0)
        self.featuresCol = Param(
            self, "featuresCol",
            "featuresCol: features column name (default: features)")
        self._setDefault(featuresCol="features")
        self.labelCol = Param(
            self, "labelCol",
            "labelCol: label column name (default: label)")
        self._setDefault(labelCol="label")
        self.learningRate = Param(
            self, "learningRate",
            "learningRate: Learning rate or shrinkage rate (default: 0.1)")
        self._setDefault(learningRate=0.1)
        self.maxBin = Param(
            self, "maxBin",
            "maxBin: Max bin (default: 255)")
        self._setDefault(maxBin=255)
        self.maxDepth = Param(
            self, "maxDepth",
            "maxDepth: Max depth (default: -1)")
        self._setDefault(maxDepth=-1)
        self.minSumHessianInLeaf = Param(
            self, "minSumHessianInLeaf",
            "minSumHessianInLeaf: minimal sum hessian in one leaf (default: 0.001)")
        self._setDefault(minSumHessianInLeaf=0.001)
        self.numIterations = Param(
            self, "numIterations",
            "numIterations: Number of iterations, LightGBM constructs num_class * num_iterations trees (default: 100)")
        self._setDefault(numIterations=100)
        self.numLeaves = Param(
            self, "numLeaves",
            "numLeaves: Number of leaves (default: 31)")
        self._setDefault(numLeaves=31)
        self.parallelism = Param(
            self, "parallelism",
            "parallelism: Tree learner parallelism, can be set to data_parallel or voting_parallel (default: data_parallel)")
        self._setDefault(parallelism="data_parallel")
        self.predictionCol = Param(
            self, "predictionCol",
            "predictionCol: prediction column name (default: prediction)")
        self._setDefault(predictionCol="prediction")
        self.probabilityCol = Param(
            self, "probabilityCol",
            "probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities (default: probability)")
        self._setDefault(probabilityCol="probability")
        self.rawPredictionCol = Param(
            self, "rawPredictionCol",
            "rawPredictionCol: raw prediction (a.k.a. confidence) column name (default: rawPrediction)")
        self._setDefault(rawPredictionCol="rawPrediction")
        # ``thresholds`` is the only param declared without a default.
        self.thresholds = Param(
            self, "thresholds",
            "thresholds: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0 excepting that at most one value may be 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold")

        # The caller's keyword arguments are read from ``_input_kwargs``:
        # first on the instance, otherwise on the decorated method itself
        # (presumably the location used by older pyspark releases).
        if hasattr(self, "_input_kwargs"):
            kwargs = self._input_kwargs
        else:
            kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, baggingFraction=1.0, baggingFreq=0, baggingSeed=3, defaultListenPort=12400,
                  featureFraction=1.0, featuresCol="features", labelCol="label", learningRate=0.1,
                  maxBin=255, maxDepth=-1, minSumHessianInLeaf=0.001, numIterations=100,
                  numLeaves=31, parallelism="data_parallel", predictionCol="prediction",
                  probabilityCol="probability", rawPredictionCol="rawPrediction", thresholds=None):
        """
        Set the (keyword only) parameters

        Args:
            baggingFraction (double): Bagging fraction (default: 1.0)
            baggingFreq (int): Bagging frequency (default: 0)
            baggingSeed (int): Bagging seed (default: 3)
            defaultListenPort (int): The default listen port on executors, used for testing (default: 12400)
            featureFraction (double): Feature fraction (default: 1.0)
            featuresCol (str): features column name (default: features)
            labelCol (str): label column name (default: label)
            learningRate (double): Learning rate or shrinkage rate (default: 0.1)
            maxBin (int): Max bin (default: 255)
            maxDepth (int): Max depth (default: -1)
            minSumHessianInLeaf (double): minimal sum hessian in one leaf (default: 0.001)
            numIterations (int): Number of iterations, LightGBM constructs num_class * num_iterations trees (default: 100)
            numLeaves (int): Number of leaves (default: 31)
            parallelism (str): Tree learner parallelism, can be set to data_parallel or voting_parallel (default: data_parallel)
            predictionCol (str): prediction column name (default: prediction)
            probabilityCol (str): Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities (default: probability)
            rawPredictionCol (str): raw prediction (a.k.a. confidence) column name (default: rawPrediction)
            thresholds (object): Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0 excepting that at most one value may be 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold

        Returns:
            The result of ``self._set(**kwargs)`` for the supplied arguments.
        """
        if hasattr(self, "_input_kwargs"):
            kwargs = self._input_kwargs
        else:
            # Fall back to THIS method's wrapper. Reading
            # ``self.__init__._input_kwargs`` here would re-apply the
            # constructor's arguments instead of the ones given to this call.
            kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setBaggingFraction(self, value):
        """
        Args:
            value (double): Bagging fraction (default: 1.0)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(baggingFraction=value)
        return self

    def getBaggingFraction(self):
        """
        Returns:
            double: Bagging fraction (default: 1.0)
        """
        return self.getOrDefault(self.baggingFraction)

    def setBaggingFreq(self, value):
        """
        Args:
            value (int): Bagging frequency (default: 0)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(baggingFreq=value)
        return self

    def getBaggingFreq(self):
        """
        Returns:
            int: Bagging frequency (default: 0)
        """
        return self.getOrDefault(self.baggingFreq)

    def setBaggingSeed(self, value):
        """
        Args:
            value (int): Bagging seed (default: 3)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(baggingSeed=value)
        return self

    def getBaggingSeed(self):
        """
        Returns:
            int: Bagging seed (default: 3)
        """
        return self.getOrDefault(self.baggingSeed)

    def setDefaultListenPort(self, value):
        """
        Args:
            value (int): The default listen port on executors, used for testing (default: 12400)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(defaultListenPort=value)
        return self

    def getDefaultListenPort(self):
        """
        Returns:
            int: The default listen port on executors, used for testing (default: 12400)
        """
        return self.getOrDefault(self.defaultListenPort)

    def setFeatureFraction(self, value):
        """
        Args:
            value (double): Feature fraction (default: 1.0)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(featureFraction=value)
        return self

    def getFeatureFraction(self):
        """
        Returns:
            double: Feature fraction (default: 1.0)
        """
        return self.getOrDefault(self.featureFraction)

    def setFeaturesCol(self, value):
        """
        Args:
            value (str): features column name (default: features)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(featuresCol=value)
        return self

    def getFeaturesCol(self):
        """
        Returns:
            str: features column name (default: features)
        """
        return self.getOrDefault(self.featuresCol)

    def setLabelCol(self, value):
        """
        Args:
            value (str): label column name (default: label)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(labelCol=value)
        return self

    def getLabelCol(self):
        """
        Returns:
            str: label column name (default: label)
        """
        return self.getOrDefault(self.labelCol)

    def setLearningRate(self, value):
        """
        Args:
            value (double): Learning rate or shrinkage rate (default: 0.1)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(learningRate=value)
        return self

    def getLearningRate(self):
        """
        Returns:
            double: Learning rate or shrinkage rate (default: 0.1)
        """
        return self.getOrDefault(self.learningRate)

    def setMaxBin(self, value):
        """
        Args:
            value (int): Max bin (default: 255)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(maxBin=value)
        return self

    def getMaxBin(self):
        """
        Returns:
            int: Max bin (default: 255)
        """
        return self.getOrDefault(self.maxBin)

    def setMaxDepth(self, value):
        """
        Args:
            value (int): Max depth (default: -1)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(maxDepth=value)
        return self

    def getMaxDepth(self):
        """
        Returns:
            int: Max depth (default: -1)
        """
        return self.getOrDefault(self.maxDepth)

    def setMinSumHessianInLeaf(self, value):
        """
        Args:
            value (double): minimal sum hessian in one leaf (default: 0.001)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(minSumHessianInLeaf=value)
        return self

    def getMinSumHessianInLeaf(self):
        """
        Returns:
            double: minimal sum hessian in one leaf (default: 0.001)
        """
        return self.getOrDefault(self.minSumHessianInLeaf)

    def setNumIterations(self, value):
        """
        Args:
            value (int): Number of iterations, LightGBM constructs num_class * num_iterations trees (default: 100)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(numIterations=value)
        return self

    def getNumIterations(self):
        """
        Returns:
            int: Number of iterations, LightGBM constructs num_class * num_iterations trees (default: 100)
        """
        return self.getOrDefault(self.numIterations)

    def setNumLeaves(self, value):
        """
        Args:
            value (int): Number of leaves (default: 31)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(numLeaves=value)
        return self

    def getNumLeaves(self):
        """
        Returns:
            int: Number of leaves (default: 31)
        """
        return self.getOrDefault(self.numLeaves)

    def setParallelism(self, value):
        """
        Args:
            value (str): Tree learner parallelism, can be set to data_parallel or voting_parallel (default: data_parallel)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(parallelism=value)
        return self

    def getParallelism(self):
        """
        Returns:
            str: Tree learner parallelism, can be set to data_parallel or voting_parallel (default: data_parallel)
        """
        return self.getOrDefault(self.parallelism)

    def setPredictionCol(self, value):
        """
        Args:
            value (str): prediction column name (default: prediction)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(predictionCol=value)
        return self

    def getPredictionCol(self):
        """
        Returns:
            str: prediction column name (default: prediction)
        """
        return self.getOrDefault(self.predictionCol)

    def setProbabilityCol(self, value):
        """
        Args:
            value (str): Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities (default: probability)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(probabilityCol=value)
        return self

    def getProbabilityCol(self):
        """
        Returns:
            str: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities (default: probability)
        """
        return self.getOrDefault(self.probabilityCol)

    def setRawPredictionCol(self, value):
        """
        Args:
            value (str): raw prediction (a.k.a. confidence) column name (default: rawPrediction)

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(rawPredictionCol=value)
        return self

    def getRawPredictionCol(self):
        """
        Returns:
            str: raw prediction (a.k.a. confidence) column name (default: rawPrediction)
        """
        return self.getOrDefault(self.rawPredictionCol)

    def setThresholds(self, value):
        """
        Args:
            value (object): Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0 excepting that at most one value may be 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold

        Returns:
            LightGBMClassifier: ``self``, so calls can be chained
        """
        self._set(thresholds=value)
        return self

    def getThresholds(self):
        """
        Returns:
            object: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0 excepting that at most one value may be 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold
        """
        # No default is registered for this param in __init__.
        return self.getOrDefault(self.thresholds)

    @classmethod
    def read(cls):
        """ Returns an MLReader instance for this class. """
        return JavaMMLReader(cls)

    @staticmethod
    def getJavaPackage():
        """ Returns package name String. """
        return "com.microsoft.ml.spark.LightGBMClassifier"

    @staticmethod
    def _from_java(java_stage):
        """
        Convert ``java_stage`` to its Python counterpart via ``from_java``,
        passing a dotted name built from this module's parent package plus
        ``.LightGBMClassifier``.
        """
        module_name = LightGBMClassifier.__module__
        module_name = module_name.rsplit(".", 1)[0] + ".LightGBMClassifier"
        return from_java(java_stage, module_name)

    def _create_model(self, java_model):
        """ Wrap the fitted ``java_model`` in the Python model class ``M``. """
        return M(java_model)
class M(ComplexParamsMixin, JavaModel, JavaMLWritable, JavaMLReadable):
    """
    Model fitted by :class:`LightGBMClassifier`.

    Only reader construction, the Java package name and Java-to-Python
    conversion are defined here; everything else comes from the base classes.
    """

    @classmethod
    def read(cls):
        """ Returns an MLReader instance for this class. """
        reader = JavaMMLReader(cls)
        return reader

    @staticmethod
    def getJavaPackage():
        """ Returns package name String. """
        # NOTE(review): "M" is not a fully qualified name like the one
        # LightGBMClassifier returns; it looks like a code-generation
        # placeholder. Confirm the intended Java class before changing it.
        return "M"

    @staticmethod
    def _from_java(java_stage):
        """
        Convert ``java_stage`` to its Python counterpart via ``from_java``,
        passing a dotted name built from this module's parent package plus
        ``.M``.
        """
        parent_package = M.__module__.rsplit(".", 1)[0]
        return from_java(java_stage, parent_package + ".M")