[scikit-learn] Differences between scikit-learn and Spark.ml for regression toy problem

Sun Mar 12 22:07:22 EDT 2017

(this was also posted to stackoverflow on 03/10)

I am setting up a very simple logistic regression problem in scikit-learn
and in spark.ml, and the results diverge: the models they learn are
different, but I can't figure out why (data is the same, model type is the
same, regularization is the same...).

No doubt I am missing some setting on one side or the other. Which setting?
How should I set up either scikit or spark.ml to find the same model as its
counterpart?

I give the sklearn code and spark.ml code below. Both should be ready to
cut-and-paste and run.

scikit-learn code:
----------------------

    import numpy as np
    from sklearn.linear_model import LogisticRegression, Ridge

    X = np.array([
        [-0.7306653538519616, 0.0],
        [0.6750417712898752, -0.4232874171873786],
        [0.1863463229359709, -0.8163423997075965],
        [-0.6719842051493347, 0.0],
        [0.9699938346531928, 0.0],
        [0.22759406190283604, 0.0],
        [0.9688721028330911, 0.0],
        [0.5993795346650845, 0.0],
        [0.9219423508390701, -0.8972778242305388],
        [0.7006904841584055, -0.5607635619919824]
    ])

    y = np.array([
        0.0,
        1.0,
        1.0,
        0.0,
        1.0,
        1.0,
        1.0,
        0.0,
        0.0,
        0.0
    ])

    m, n = X.shape

    # Add intercept term to simulate inputs to GameEstimator
    X_with_intercept = np.hstack((X, np.ones(m)[:,np.newaxis]))

    l = 0.3
    e = LogisticRegression(
        fit_intercept=False,
        penalty='l2',
        C=1/l,
        max_iter=100,
        tol=1e-11)

    e.fit(X_with_intercept, y)

    print e.coef_
    # => [[ 0.98662189  0.45571052 -0.23467255]]

    # Linear regression is called Ridge in sklearn
    e = Ridge(
        fit_intercept=False,
        alpha=l,
        max_iter=100,
        tol=1e-11)

    e.fit(X_with_intercept, y)

    print e.coef_
    # =>[ 0.32155545  0.17904355  0.41222418]

spark.ml code:
-------------------

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.ml.classification.LogisticRegression
    import org.apache.spark.ml.regression.LinearRegression
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.sql.SQLContext

    object TestSparkRegression {
      def main(args: Array[String]): Unit = {
        import org.apache.log4j.{Level, Logger}

        Logger.getLogger("org").setLevel(Level.OFF)
        Logger.getLogger("akka").setLevel(Level.OFF)

        val conf = new SparkConf().setAppName("test").setMaster("local")
        val sc = new SparkContext(conf)

        val sparkTrainingData = new SQLContext(sc)
          .createDataFrame(Seq(
            LabeledPoint(0.0, Vectors.dense(-0.7306653538519616, 0.0)),
            LabeledPoint(1.0, Vectors.dense(0.6750417712898752,
-0.4232874171873786)),
            LabeledPoint(1.0, Vectors.dense(0.1863463229359709,
-0.8163423997075965)),
            LabeledPoint(0.0, Vectors.dense(-0.6719842051493347, 0.0)),
            LabeledPoint(1.0, Vectors.dense(0.9699938346531928, 0.0)),
            LabeledPoint(1.0, Vectors.dense(0.22759406190283604, 0.0)),
            LabeledPoint(1.0, Vectors.dense(0.9688721028330911, 0.0)),
            LabeledPoint(0.0, Vectors.dense(0.5993795346650845, 0.0)),
            LabeledPoint(0.0, Vectors.dense(0.9219423508390701,
-0.8972778242305388)),
            LabeledPoint(0.0, Vectors.dense(0.7006904841584055,
-0.5607635619919824))))
          .toDF("label", "features")

        val logisticModel = new LogisticRegression()
          .setRegParam(0.3)
          .setLabelCol("label")
          .setFeaturesCol("features")
          .fit(sparkTrainingData)

        println(s"Spark logistic model coefficients:
${logisticModel.coefficients} Intercept: ${logisticModel.intercept}")
        // Spark logistic model coefficients:
[0.5451588538376263,0.26740606573584713] Intercept: -0.13897955358689987

        val linearModel = new LinearRegression()
          .setRegParam(0.3)
          .setLabelCol("label")
          .setFeaturesCol("features")
          .setSolver("l-bfgs")
          .fit(sparkTrainingData)

        println(s"Spark linear model coefficients:
${linearModel.coefficients} Intercept: ${linearModel.intercept}")
        // Spark linear model coefficients:
[0.19852664861346023,0.11501200541407802] Intercept: 0.45464906876832323

        sc.stop()
      }
    }

Thanks,

Frank
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/scikit-learn/attachments/20170312/f9ca0b42/attachment-0001.html>