picnicml · Helw150 · Sep 27, 2019 · Oct 8, 2019 · Oct 8, 2019
diff --git a/.scalafmt.conf b/.scalafmt.conf
@@ -0,0 +1,21 @@
+version = 2.0.0
+maxColumn = 120
+align = none
+
+continuationIndent {
+  callSite = 2
+  defnSite = 2
+}
+
+newlines {
+  afterImplicitKWInVerticalMultiline = true
+  beforeImplicitKWInVerticalMultiline = true
+  sometimesBeforeColonInMethodReturnType = true
+}
+
+project.git = false
+
+rewrite {
+  rules = [PreferCurlyFors, RedundantBraces, RedundantParens, SortImports, SortModifiers]
+  redundantBraces.maxLines = 1
+}
diff --git a/project/plugins.sbt b/project/plugins.sbt
@@ -2,3 +2,4 @@ addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.0")
 addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.8")
 addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.2-1")
 addSbtPlugin("com.codacy" % "sbt-codacy-coverage" % "3.0.3")
+addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.0.4")
diff --git a/src/main/scala/io/picnicml/doddlemodel/data/CsvLoader.scala b/src/main/scala/io/picnicml/doddlemodel/data/CsvLoader.scala
@@ -17,13 +17,14 @@ object CsvLoader {
     val lines = bufferedSource.getLines()
     val featureIndex = inferFeatureIndex(lines)
 
-    val data = if (featureIndex.types.contains(CategoricalFeature))
-      loadWithMixedFeatures(lines.toList, na, featureIndex)
-    else
-      loadWithNumericalFeatures(lines, na, featureIndex)
+    val data =
+      if (featureIndex.types.contains(CategoricalFeature))
+        loadWithMixedFeatures(lines.toList, na, featureIndex)
+      else
+        loadWithNumericalFeatures(lines, na, featureIndex)
 
     bufferedSource.close()
-    (DenseMatrix(ArraySeq.unsafeWrapArray(data):_*), featureIndex)
+    (DenseMatrix(ArraySeq.unsafeWrapArray(data): _*), featureIndex)
   }
 
   private def inferFeatureIndex(lines: Iterator[String]): FeatureIndex = {
@@ -33,36 +34,48 @@ object CsvLoader {
 
     if (!lines.hasNext)
       throw new IllegalArgumentException("File has a missing header line: feature types")
-    val featureTypes = lines.next().split(",").map(x => removeQuotes(x)).map {
-      case x if x == NumericalFeature.headerLineString => NumericalFeature
-      case x if x == CategoricalFeature.headerLineString => CategoricalFeature
-      case _ => throw new IllegalArgumentException("File contains invalid feature type encoding (header line)")
-    }.toList
+    val featureTypes = lines
+      .next()
+      .split(",")
+      .map(x => removeQuotes(x))
+      .map {
+        case x if x == NumericalFeature.headerLineString => NumericalFeature
+        case x if x == CategoricalFeature.headerLineString => CategoricalFeature
+        case _ => throw new IllegalArgumentException("File contains invalid feature type encoding (header line)")
+      }
+      .toList
 
     FeatureIndex(featureNames, featureTypes, featureNames.indices.toList)
   }
 
-  private def loadWithNumericalFeatures(lines: Iterator[String],
-                                        na: String,
-                                        featureIndex: FeatureIndex): Array[Array[Double]] = {
-    lines.map(_.split(",").map { featureValue =>
-      val trimmedValue = removeQuotes(featureValue)
-      if (trimmedValue == na) Double.NaN else parseDouble(trimmedValue)
-    }).toArray
+  /** Turns every CSV line into a row of doubles; a cell equal to `na` once unquoted becomes NaN. */
+  private def loadWithNumericalFeatures(
+    lines: Iterator[String],
+    na: String,
+    featureIndex: FeatureIndex
+  ): Array[Array[Double]] = {
+    def parseCell(rawValue: String): Double = {
+      val unquoted = removeQuotes(rawValue)
+      if (unquoted == na) Double.NaN else parseDouble(unquoted)
+    }
+    lines.map(_.split(",").map(parseCell)).toArray
   }
 
-  private def loadWithMixedFeatures(lines: List[String],
-                                    na: String,
-                                    featureIndex: FeatureIndex): Array[Array[Double]] = {
+  /** Converts rows to doubles: `na` cells become NaN, numerical cells are parsed, categorical ones label-encoded. */
+  private def loadWithMixedFeatures(
+    lines: List[String],
+    na: String,
+    featureIndex: FeatureIndex
+  ): Array[Array[Double]] = {
     val labelEncoder = inferLabelEncoder(lines, na, featureIndex)
     lines.map { rowValues =>
-      rowValues.split(",").zipWithIndex.map { case (featureValue, columnIndex) =>
-        val trimmedValue = removeQuotes(featureValue)
-        featureIndex.types(columnIndex) match {
-          case _ if trimmedValue == na => Double.NaN
-          case NumericalFeature => parseDouble(trimmedValue)
-          case CategoricalFeature => labelEncoder.encode(trimmedValue, featureIndex.names(columnIndex))
-        }
+      rowValues.split(",").zipWithIndex.map {
+        case (rawValue, columnIndex) =>
+          (removeQuotes(rawValue), featureIndex.types(columnIndex)) match {
+            case (`na`, _) => Double.NaN
+            case (value, NumericalFeature) => parseDouble(value)
+            case (value, CategoricalFeature) => labelEncoder.encode(value, featureIndex.names(columnIndex))
+          }
       }
     }.toArray
   }
@@ -71,14 +84,17 @@ object CsvLoader {
   private def inferLabelEncoder(lines: List[String], na: String, featureIndex: FeatureIndex): LabelEncoder = {
     val encoder = mutable.AnyRefMap[String, mutable.AnyRefMap[String, Double]]()
     val categoricalFeatures = featureIndex.categorical
-    categoricalFeatures.names.foreach { name => encoder(name) = mutable.AnyRefMap[String, Double]() }
+    categoricalFeatures.names.foreach { name =>
+      encoder(name) = mutable.AnyRefMap[String, Double]()
+    }
 
     lines.foreach { rowValues =>
       val rowValuesArray = rowValues.split(",").map(x => removeQuotes(x))
-      categoricalFeatures.columnIndices.zip(categoricalFeatures.names).foreach { case (columnIndex, name) =>
-        val featureValue = rowValuesArray(columnIndex)
-        if (featureValue != na && !encoder(name).contains(featureValue))
-          encoder(name)(featureValue) = encoder(name).size.toDouble
+      categoricalFeatures.columnIndices.zip(categoricalFeatures.names).foreach {
+        case (columnIndex, name) =>
+          val featureValue = rowValuesArray(columnIndex)
+          if (featureValue != na && !encoder(name).contains(featureValue))
+            encoder(name)(featureValue) = encoder(name).size.toDouble
       }
     }
 
@@ -90,16 +106,15 @@ object CsvLoader {
     *
     * @param encoder a map containing mapping of categorical values to numerical values for all categorical features
     */
-  private class LabelEncoder(private val encoder:  mutable.AnyRefMap[String, mutable.AnyRefMap[String, Double]]) {
+  private class LabelEncoder(private val encoder: mutable.AnyRefMap[String, mutable.AnyRefMap[String, Double]]) {
     def encode(featureValue: String, featureName: String): Double = encoder(featureName)(featureValue)
   }
 
   private def removeQuotes(s: String): String =
     s.replaceAll("\"", "").replaceAll("'", "")
 
   private def parseDouble(featureValue: String): Double = {
-    try
-      featureValue.toDouble
+    try featureValue.toDouble
     catch {
       case _: NumberFormatException =>
         throw new IllegalArgumentException(

diff --git a/src/main/scala/io/picnicml/doddlemodel/data/DatasetUtils.scala b/src/main/scala/io/picnicml/doddlemodel/data/DatasetUtils.scala
@@ -21,16 +21,20 @@ object DatasetUtils {
   }
 
   /** Splits the dataset into two subsets for training and testing and makes sure groups in each are non-overlapping. */
-  def splitDatasetWithGroups(x: Features,
-                             y: Target,
-                             groups: IntVector,
-                             proportionTrain: Double = 0.5): GroupTrainTestSplit = {
+  def splitDatasetWithGroups(
+    x: Features,
+    y: Target,
+    groups: IntVector,
+    proportionTrain: Double = 0.5
+  ): GroupTrainTestSplit = {
     val numTrain = numberOfTrainExamplesBasedOnProportion(x.rows, proportionTrain)
     val numSamplesPerGroup = hist(groups, numberOfUniqueGroups(groups)).hist.toArray
     val (sortedNumSamplesPerGroup, toOriginalGroupIndex) = numSamplesPerGroup.zipWithIndex.sorted.unzip
 
     val numGroupsInTrain = sortedNumSamplesPerGroup
-      .foldLeft(List(0)) { case (acc, currGroupSize) => (acc(0) + currGroupSize) :: acc }.reverse.drop(1)
+      .foldLeft(List(0)) { case (acc, currGroupSize) => (acc(0) + currGroupSize) :: acc }
+      .reverse
+      .drop(1)
       .takeWhile(cumulativeNumSamples => cumulativeNumSamples <= numTrain)
       .length
 

diff --git a/src/main/scala/io/picnicml/doddlemodel/data/Feature.scala b/src/main/scala/io/picnicml/doddlemodel/data/Feature.scala
@@ -19,9 +19,11 @@ object Feature {
   }
 
   @SerialVersionUID(0L)
-  class FeatureIndex(val names: IndexedSeq[String],
-                     val types: IndexedSeq[FeatureType],
-                     val columnIndices: IndexedSeq[Int]) extends Serializable {
+  class FeatureIndex(
+    val names: IndexedSeq[String],
+    val types: IndexedSeq[FeatureType],
+    val columnIndices: IndexedSeq[Int]
+  ) extends Serializable {
 
     def categorical: FeatureIndex = onlyFeaturesOfType[CategoricalFeature.type]
 
@@ -32,15 +34,15 @@ object Feature {
       val subsetIndices = this.types.zipWithIndex.flatMap {
         case (t, i) => if (cls.isInstance(t)) i.some else none[Int]
       }
-      subset(subsetIndices:_*)
+      subset(subsetIndices: _*)
     }
 
     def subset(names: String*): FeatureIndex = {
       val nameToIndex = this.names.zipWithIndex.toMap
-      subset(names.map(n => nameToIndex(n)):_*)
+      subset(names.map(n => nameToIndex(n)): _*)
     }
 
-    def subset(indices: IndexedSeq[Int]): FeatureIndex = subset(indices:_*)
+    def subset(indices: IndexedSeq[Int]): FeatureIndex = subset(indices: _*)
 
     // DummyImplicit is needed to avoid the same type as String* after erasure
     def subset(indices: Int*)(implicit di: DummyImplicit): FeatureIndex = new FeatureIndex(
@@ -52,8 +54,9 @@ object Feature {
     def drop(index: Int): FeatureIndex = new FeatureIndex(
       this.names.zipWithIndex.flatMap { case (n, i) => if (i != index) n.some else none[String] },
       this.types.zipWithIndex.flatMap { case (t, i) => if (i != index) t.some else none[FeatureType] },
-      this.columnIndices.zipWithIndex.flatMap { case (ci, i) =>
-        if (i == index) none[Int] else if (i > index) (ci - 1).some else ci.some
+      this.columnIndices.zipWithIndex.flatMap {
+        case (ci, i) =>
+          if (i == index) none[Int] else if (i > index) (ci - 1).some else ci.some
       }
     )
 

diff --git a/src/main/scala/io/picnicml/doddlemodel/data/ResourceDatasetLoaders.scala b/src/main/scala/io/picnicml/doddlemodel/data/ResourceDatasetLoaders.scala
@@ -39,12 +39,13 @@ object ResourceDatasetLoaders {
 
   private def getBufferedSourceFromResource(path: String): BufferedSource = {
     val resourceUrl = getClass.getResource(path)
-    val file = if (resourceUrl.toString.startsWith("jar:"))
-      // reads file from JAR
-      readResourceFileWithinJar(path)
-    else
-      // reads file when using IDE
-      new File(resourceUrl.getFile)
+    val file =
+      if (resourceUrl.toString.startsWith("jar:"))
+        // reads file from JAR
+        readResourceFileWithinJar(path)
+      else
+        // reads file when using IDE
+        new File(resourceUrl.getFile)
     if (file != null && !file.exists)
       throw new RuntimeException(s"Error: File $file not found!")
     Source.fromFile(file)

diff --git a/src/main/scala/io/picnicml/doddlemodel/data/data.scala b/src/main/scala/io/picnicml/doddlemodel/data/data.scala
@@ -2,9 +2,11 @@ package io.picnicml.doddlemodel.data
 
 case class TrainTestSplit(xTr: Features, yTr: Target, xTe: Features, yTe: Target)
 
-case class GroupTrainTestSplit(xTr: Features,
-                               yTr: Target,
-                               groupsTr: IntVector,
-                               xTe: Features,
-                               yTe: Target,
-                               groupsTe: IntVector)
+case class GroupTrainTestSplit(
+  xTr: Features,
+  yTr: Target,
+  groupsTr: IntVector,
+  xTe: Features,
+  yTe: Target,
+  groupsTe: IntVector
+)
diff --git a/src/main/scala/io/picnicml/doddlemodel/data/package.scala b/src/main/scala/io/picnicml/doddlemodel/data/package.scala
@@ -1,6 +1,6 @@
 package io.picnicml.doddlemodel
 
-import breeze.linalg.{DenseMatrix, DenseVector, unique}
+import breeze.linalg.{unique, DenseMatrix, DenseVector}
 import io.picnicml.doddlemodel.CrossScalaCompat.doubleOrdering
 import io.picnicml.doddlemodel.data.Feature.FeatureIndex
 
@@ -24,17 +24,20 @@ package object data {
 
   def numberOfUniqueGroups(groups: IntVector): Int = {
     val uniqueGroups = unique(groups)
-    require(uniqueGroups.toArray.sorted sameElements Array.range(0, uniqueGroups.length),
-      "Invalid encoding of groups, all group indices in [0, numGroups) have to exist")
+    // a valid encoding uses every index in [0, numGroups), i.e. the sorted unique values equal the full range
+    val expectedGroups = Array.range(0, uniqueGroups.length)
+    val isValidEncoding = uniqueGroups.toArray.sorted sameElements expectedGroups
+    require(isValidEncoding, "Invalid encoding of groups, all group indices in [0, numGroups) have to exist")
     uniqueGroups.length
   }
 
   def numberOfTargetClasses(y: Target): Int = {
     val targetClasses = unique(y)
-    require(targetClasses.length >= 2,
-      "Target variable must be comprised of at least two categories")
-    require(targetClasses.toArray.sorted sameElements Array.range(0, targetClasses.length),
-      "Invalid encoding of categories in the target variable")
+    require(targetClasses.length >= 2, "Target variable must be comprised of at least two categories")
+    // the sorted unique target values have to equal the full range 0 until numClasses
+    val expectedClasses = Array.range(0, targetClasses.length)
+    val isValidEncoding = targetClasses.toArray.sorted sameElements expectedClasses
+    require(isValidEncoding, "Invalid encoding of categories in the target variable")
     targetClasses.length
   }
 }
diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/MostFrequentClassifier.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/MostFrequentClassifier.scala
@@ -28,9 +28,14 @@ object MostFrequentClassifier {
       model.copy(numClasses = numClasses.some)
 
     override protected def fitSafe(model: MostFrequentClassifier, x: Features, y: Target): MostFrequentClassifier = {
-      val mostFrequentClass = y.activeValuesIterator.foldLeft(Map[Double, Int]()) { (acc, x) =>
-        if (acc.contains(x)) acc + (x -> (acc(x) + 1)) else acc + (x -> 1)
-      }.toArray.sortBy(_._1).maxBy(_._2)._1
+      // Tally how often each class occurs; the lambda parameter is called `label` because naming it `x`
+      // would shadow the `x: Features` parameter of fitSafe.
+      val classCounts = y.activeValuesIterator.foldLeft(Map[Double, Int]()) { (counts, label) =>
+        counts.updated(label, counts.getOrElse(label, 0) + 1)
+      }
+      // Entries are ordered by class value before the highest count is picked.
+      // NOTE(review): maxBy fails on an empty array, so an empty target is not supported here - confirm callers.
+      val mostFrequentClass = classCounts.toArray.sortBy(_._1).maxBy(_._2)._1
 
       model.copy(mostFrequentClass = mostFrequentClass.some)
     }

diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/StratifiedClassifier.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/StratifiedClassifier.scala
@@ -37,9 +37,13 @@ object StratifiedClassifier {
       model.copy(numClasses = numClasses.some)
 
     override protected def fitSafe(model: StratifiedClassifier, x: Features, y: Target): StratifiedClassifier = {
-      val probs = y.activeValuesIterator.foldLeft(Map[Double, Int]()) { (acc, x) =>
-        if (acc.contains(x)) acc + (x -> (acc(x) + 1)) else acc + (x -> 1)
-      }.toArray.sortBy(_._1).map(_._2 / y.length.toDouble)
+      // Tally how often each class occurs; the lambda parameter is called `label` because naming it `x`
+      // would shadow the `x: Features` parameter of fitSafe.
+      val classCounts = y.activeValuesIterator.foldLeft(Map[Double, Int]()) { (counts, label) =>
+        counts.updated(label, counts.getOrElse(label, 0) + 1)
+      }
+      // relative frequency of every observed class, ordered by class value
+      val probs = classCounts.toArray.sortBy(_._1).map(_._2 / y.length.toDouble)
 
       model.copy(targetDistr = Multinomial[RealVector, Int](DenseVector(probs)).some)
     }

diff --git a/src/main/scala/io/picnicml/doddlemodel/dummy/classification/UniformClassifier.scala b/src/main/scala/io/picnicml/doddlemodel/dummy/classification/UniformClassifier.scala
@@ -1,6 +1,6 @@
 package io.picnicml.doddlemodel.dummy.classification
 
-import breeze.linalg.{DenseVector, convert}
+import breeze.linalg.{convert, DenseVector}
 import breeze.stats.distributions.Rand
 import cats.syntax.option._
 import io.picnicml.doddlemodel.data.{Features, Simplex, Target}

diff --git a/src/main/scala/io/picnicml/doddlemodel/impute/MeanValueImputer.scala b/src/main/scala/io/picnicml/doddlemodel/impute/MeanValueImputer.scala
@@ -17,8 +17,10 @@ import io.picnicml.doddlemodel.typeclasses.Transformer
   * val imputer = MeanValueImputer(featureIndex)
   * val imputerSubsetOfColumns = MeanValueImputer(featureIndex.subset("f0", "f2"))
   */
-case class MeanValueImputer private (private[impute] val means: Option[RealVector],
-                                     private val featureIndex: FeatureIndex)
+case class MeanValueImputer private (
+  private[impute] val means: Option[RealVector],
+  private val featureIndex: FeatureIndex
+)
 
 object MeanValueImputer {
 
@@ -40,10 +42,11 @@ object MeanValueImputer {
 
     override protected def transformSafe(model: MeanValueImputer, x: Features): Features = {
       val xCopy = x.copy
-      model.featureIndex.numerical.columnIndices.zipWithIndex.foreach { case (colIndex, statisticIndex) =>
-        xCopy(::, colIndex).findAll(_.isNaN).iterator.foreach { rowIndex =>
-          xCopy(rowIndex, colIndex) = model.means.getOrBreak(statisticIndex)
-        }
+      // each NaN entry of a numerical column is replaced by the statistic stored for that column
+      for ((colIndex, statisticIndex) <- model.featureIndex.numerical.columnIndices.zipWithIndex) {
+        val missingRows = xCopy(::, colIndex).findAll(_.isNaN)
+        // the statistic is looked up per missing entry, so columns without NaNs never read model.means
+        missingRows.foreach(rowIndex => xCopy(rowIndex, colIndex) = model.means.getOrBreak(statisticIndex))
       }
       xCopy
     }