<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question 10-fold cross validation in Random Forests in Archives of Support Questions (Read Only)</title>
    <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/10-fold-cross-validation-in-Random-Forests/m-p/135994#M39689</link>
    <description>&lt;P&gt;Hello,&lt;/P&gt;&lt;P&gt;I am using &lt;A href="http://spark.apache.org/docs/latest/mllib-ensembles.html"&gt;this Scala code&lt;/A&gt; of MLlib about random forests. I wonder if this code uses 10-fold cross validation. If not, I would like to know how to do it in Scala.&lt;/P&gt;&lt;P&gt;Thanks,&lt;/P&gt;&lt;P&gt;Laia&lt;/P&gt;</description>
    <pubDate>Mon, 05 Sep 2016 16:32:00 GMT</pubDate>
    <dc:creator>laia_subirats</dc:creator>
    <dc:date>2016-09-05T16:32:00Z</dc:date>
    <item>
      <title>10-fold cross validation in Random Forests</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/10-fold-cross-validation-in-Random-Forests/m-p/135994#M39689</link>
      <description>&lt;P&gt;Hello,&lt;/P&gt;&lt;P&gt;I am using &lt;A href="http://spark.apache.org/docs/latest/mllib-ensembles.html"&gt;this Scala code&lt;/A&gt; of MLlib about random forests. I wonder if this code uses 10-fold cross validation. If not, I would like to know how to do it in Scala.&lt;/P&gt;&lt;P&gt;Thanks,&lt;/P&gt;&lt;P&gt;Laia&lt;/P&gt;</description>
      <pubDate>Mon, 05 Sep 2016 16:32:00 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/10-fold-cross-validation-in-Random-Forests/m-p/135994#M39689</guid>
      <dc:creator>laia_subirats</dc:creator>
      <dc:date>2016-09-05T16:32:00Z</dc:date>
    </item>
    <item>
      <title>Re: 10-fold cross validation in Random Forests</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/10-fold-cross-validation-in-Random-Forests/m-p/135995#M39690</link>
      <description>&lt;P&gt;No, that code is not using cross-validation. An example of how to use cross-validation can be found &lt;A href="https://spark.apache.org/docs/latest/ml-tuning.html#cross-validation"&gt;here&lt;/A&gt;. It needs the DataFrame API, so you should refer to &lt;A href="https://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier"&gt;this&lt;/A&gt; for the Random Forest implementation.&lt;/P&gt;</description>
      <pubDate>Mon, 05 Sep 2016 17:38:49 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/10-fold-cross-validation-in-Random-Forests/m-p/135995#M39690</guid>
      <dc:creator>mgaido</dc:creator>
      <dc:date>2016-09-05T17:38:49Z</dc:date>
    </item>
    <item>
      <title>Re: 10-fold cross validation in Random Forests</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/10-fold-cross-validation-in-Random-Forests/m-p/135996#M39691</link>
      <description>&lt;P&gt;Hello, &lt;/P&gt;&lt;P&gt;I have the following error: java.lang.IllegalArgumentException: Field "label_idx" does not exist.&lt;/P&gt;&lt;P&gt;After executing this code:&lt;/P&gt;&lt;P&gt;import org.apache.spark.mllib.tree.RandomForest &lt;/P&gt;&lt;P&gt;import org.apache.spark.mllib.tree.model.RandomForestModel &lt;/P&gt;&lt;P&gt;import org.apache.spark.mllib.util.MLUtils &lt;/P&gt;&lt;P&gt;import org.apache.spark.mllib.linalg.Vectors &lt;/P&gt;&lt;P&gt;import org.apache.spark.mllib.regression.LabeledPoint &lt;/P&gt;&lt;P&gt;import org.apache.spark.mllib.evaluation.MulticlassMetrics &lt;/P&gt;&lt;P&gt;import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator} &lt;/P&gt;&lt;P&gt;import org.apache.spark.ml.classification.RandomForestClassifier &lt;/P&gt;&lt;P&gt;import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator &lt;/P&gt;&lt;P&gt;import org.apache.spark.sql.types._
import sqlContext.implicits._ &lt;/P&gt;&lt;P&gt;import org.apache.spark.ml.attribute.NominalAttribute &lt;/P&gt;&lt;P&gt;import org.apache.spark.ml.feature.StringIndexer &lt;/P&gt;&lt;P&gt;val unparseddata = sc.textFile("hdfs:///tmp/epidemiological16.csv") &lt;/P&gt;&lt;P&gt;val data = unparseddata.map { line =&amp;gt;
  val parts = line.split(',').map(_.toDouble)
  LabeledPoint(parts.last%2, Vectors.dense(parts.slice(0, parts.length - 1)))
} &lt;/P&gt;&lt;P&gt;val splits = data.randomSplit(Array(0.7, 0.3)) &lt;/P&gt;&lt;P&gt;val (trainingData2, testData2) = (splits(0), splits(1)) &lt;/P&gt;&lt;P&gt;val trainingData = trainingData2.toDF &lt;/P&gt;&lt;P&gt;val nFolds: Int = 10 &lt;/P&gt;&lt;P&gt;val NumTrees: Int = 3 &lt;/P&gt;&lt;P&gt;val rf = new RandomForestClassifier()
  .setNumTrees(NumTrees)
  .setFeaturesCol("features") &lt;/P&gt;&lt;P&gt;val indexer = new StringIndexer()
  .setInputCol("label")
  .setOutputCol("label_idx")
  .fit(trainingData) &lt;/P&gt;&lt;P&gt;rf.setLabelCol("label_idx").fit(indexer.transform(trainingData)) &lt;/P&gt;&lt;P&gt;val pipeline = new Pipeline().setStages(Array(rf)) &lt;/P&gt;&lt;P&gt;
val paramGrid = new ParamGridBuilder().build()  &lt;/P&gt;&lt;P&gt;val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction") &lt;/P&gt;&lt;P&gt;val cv = new CrossValidator()
  .setEstimator(pipeline)
  .setEvaluator(evaluator) 
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(nFolds) &lt;/P&gt;&lt;P&gt;val model = cv.fit(trainingData)&lt;/P&gt;&lt;P&gt;Do you know where the problem might be?&lt;/P&gt;&lt;P&gt;Thanks,&lt;/P&gt;&lt;P&gt;Laia &lt;/P&gt;</description>
      <pubDate>Mon, 05 Sep 2016 19:46:48 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/10-fold-cross-validation-in-Random-Forests/m-p/135996#M39691</guid>
      <dc:creator>laia_subirats</dc:creator>
      <dc:date>2016-09-05T19:46:48Z</dc:date>
    </item>
    <item>
      <title>Re: 10-fold cross validation in Random Forests</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/10-fold-cross-validation-in-Random-Forests/m-p/135997#M39692</link>
      <description>&lt;P&gt;Could you please post the full stack trace of the exception? It looks like the indexer is not properly creating the label_idx column...&lt;/P&gt;</description>
      <pubDate>Tue, 06 Sep 2016 14:53:01 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/10-fold-cross-validation-in-Random-Forests/m-p/135997#M39692</guid>
      <dc:creator>mgaido</dc:creator>
      <dc:date>2016-09-06T14:53:01Z</dc:date>
    </item>
    <item>
      <title>Re: 10-fold cross validation in Random Forests</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/10-fold-cross-validation-in-Random-Forests/m-p/135998#M39693</link>
      <description>&lt;P&gt;Hello Marco,&lt;/P&gt;&lt;P&gt;Dan already answered the question here &lt;A href="https://community.hortonworks.com/answers/55111/view.html"&gt;https://community.hortonworks.com/answers/55111/view.html&lt;/A&gt;&lt;/P&gt;&lt;P&gt;Thanks,&lt;/P&gt;&lt;P&gt;Laia&lt;/P&gt;</description>
      <pubDate>Wed, 07 Sep 2016 14:42:28 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/10-fold-cross-validation-in-Random-Forests/m-p/135998#M39693</guid>
      <dc:creator>laia_subirats</dc:creator>
      <dc:date>2016-09-07T14:42:28Z</dc:date>
    </item>
    <item>
      <title>Re: 10-fold cross validation in Random Forests</title>
      <link>https://community.cloudera.com/t5/Archives-of-Support-Questions/10-fold-cross-validation-in-Random-Forests/m-p/135999#M39694</link>
      <description>&lt;P&gt;I would like to perform 10-fold cross-validation (CV) with a random forest on an RDD input, but I am having a problem converting the RDD input to a DataFrame.
I am using this code as you recommended: &lt;/P&gt;&lt;P&gt;
import org.apache.spark.ml.Pipeline;&lt;/P&gt;&lt;P&gt;
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator};&lt;/P&gt;&lt;P&gt;
import org.apache.spark.ml.classification.RandomForestClassifier; &lt;/P&gt;&lt;P&gt;import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator; &lt;/P&gt;&lt;P&gt;var inputPath  = "..." &lt;/P&gt;&lt;P&gt;var text = sc.textFile(inputPath) &lt;/P&gt;&lt;P&gt;var rows = text.map(line =&amp;gt; line.split(",").map(_.trim)).map(a =&amp;gt; Row.fromSeq(a)) &lt;/P&gt;&lt;P&gt;var header = rows.first() &lt;/P&gt;&lt;P&gt;val schema = StructType(header.map(fieldName =&amp;gt; StructField(fieldName.asInstanceOf[String],StringType,true))) &lt;/P&gt;&lt;P&gt;val df = spark.createDataFrame(rows,schema) &lt;/P&gt;&lt;P&gt;val nFolds: Int = 10 &lt;/P&gt;&lt;P&gt;val NumTrees: Int = 30 &lt;/P&gt;&lt;P&gt;val metric: String = "accuracy" &lt;/P&gt;&lt;P&gt;val rf = new RandomForestClassifier()
  .setLabelCol("label")
  .setFeaturesCol("features")
  .setNumTrees(NumTrees) &lt;/P&gt;&lt;P&gt;val pipeline = new Pipeline().setStages(Array(rf)) 
val paramGrid = new ParamGridBuilder().build() // No parameter search&lt;/P&gt;&lt;P&gt;
val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
     .setMetricName(metric) &lt;/P&gt;&lt;P&gt;val cv = new CrossValidator()
  .setEstimator(pipeline)
  .setEvaluator(evaluator) 
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(nFolds) &lt;/P&gt;&lt;P&gt;val model = cv.fit(df) // trainingData: DataFrame &lt;/P&gt;&lt;P&gt;Any help please?
Thank you.&lt;/P&gt;</description>
      <pubDate>Fri, 04 Aug 2017 21:47:16 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Archives-of-Support-Questions/10-fold-cross-validation-in-Random-Forests/m-p/135999#M39694</guid>
      <dc:creator>zeinebchelly</dc:creator>
      <dc:date>2017-08-04T21:47:16Z</dc:date>
    </item>
  </channel>
</rss>

