Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .scalafmt.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
version = 2.0.0
maxColumn = 120
align = none

continuationIndent {
callSite = 2
defnSite = 2
}

newlines {
afterImplicitKWInVerticalMultiline = true
beforeImplicitKWInVerticalMultiline = true
sometimesBeforeColonInMethodReturnType = true
}

project.git = false

rewrite {
rules = [PreferCurlyFors, RedundantBraces, RedundantParens, SortImports, SortModifiers]
redundantBraces.maxLines = 1
}
1 change: 1 addition & 0 deletions project/plugins.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.0")
addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.8")
addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.2-1")
addSbtPlugin("com.codacy" % "sbt-codacy-coverage" % "3.0.3")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.0.4")
85 changes: 50 additions & 35 deletions src/main/scala/io/picnicml/doddlemodel/data/CsvLoader.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@ object CsvLoader {
val lines = bufferedSource.getLines()
val featureIndex = inferFeatureIndex(lines)

val data = if (featureIndex.types.contains(CategoricalFeature))
loadWithMixedFeatures(lines.toList, na, featureIndex)
else
loadWithNumericalFeatures(lines, na, featureIndex)
val data =
if (featureIndex.types.contains(CategoricalFeature))
loadWithMixedFeatures(lines.toList, na, featureIndex)
else
loadWithNumericalFeatures(lines, na, featureIndex)

bufferedSource.close()
(DenseMatrix(ArraySeq.unsafeWrapArray(data):_*), featureIndex)
(DenseMatrix(ArraySeq.unsafeWrapArray(data): _*), featureIndex)
}

private def inferFeatureIndex(lines: Iterator[String]): FeatureIndex = {
Expand All @@ -33,36 +34,48 @@ object CsvLoader {

if (!lines.hasNext)
throw new IllegalArgumentException("File has a missing header line: feature types")
val featureTypes = lines.next().split(",").map(x => removeQuotes(x)).map {
case x if x == NumericalFeature.headerLineString => NumericalFeature
case x if x == CategoricalFeature.headerLineString => CategoricalFeature
case _ => throw new IllegalArgumentException("File contains invalid feature type encoding (header line)")
}.toList
val featureTypes = lines
.next()
.split(",")
.map(x => removeQuotes(x))
.map {
case x if x == NumericalFeature.headerLineString => NumericalFeature
case x if x == CategoricalFeature.headerLineString => CategoricalFeature
case _ => throw new IllegalArgumentException("File contains invalid feature type encoding (header line)")
}
.toList

FeatureIndex(featureNames, featureTypes, featureNames.indices.toList)
}

private def loadWithNumericalFeatures(lines: Iterator[String],
na: String,
featureIndex: FeatureIndex): Array[Array[Double]] = {
lines.map(_.split(",").map { featureValue =>
val trimmedValue = removeQuotes(featureValue)
if (trimmedValue == na) Double.NaN else parseDouble(trimmedValue)
}).toArray
private def loadWithNumericalFeatures(
lines: Iterator[String],
na: String,
featureIndex: FeatureIndex
): Array[Array[Double]] = {
lines
.map(_.split(",").map { featureValue =>
val trimmedValue = removeQuotes(featureValue)
if (trimmedValue == na) Double.NaN else parseDouble(trimmedValue)
})
.toArray
}

private def loadWithMixedFeatures(lines: List[String],
na: String,
featureIndex: FeatureIndex): Array[Array[Double]] = {
private def loadWithMixedFeatures(
lines: List[String],
na: String,
featureIndex: FeatureIndex
): Array[Array[Double]] = {
val labelEncoder = inferLabelEncoder(lines, na, featureIndex)
lines.map { rowValues =>
rowValues.split(",").zipWithIndex.map { case (featureValue, columnIndex) =>
val trimmedValue = removeQuotes(featureValue)
featureIndex.types(columnIndex) match {
case _ if trimmedValue == na => Double.NaN
case NumericalFeature => parseDouble(trimmedValue)
case CategoricalFeature => labelEncoder.encode(trimmedValue, featureIndex.names(columnIndex))
}
rowValues.split(",").zipWithIndex.map {
case (featureValue, columnIndex) =>
val trimmedValue = removeQuotes(featureValue)
featureIndex.types(columnIndex) match {
case _ if trimmedValue == na => Double.NaN
case NumericalFeature => parseDouble(trimmedValue)
case CategoricalFeature => labelEncoder.encode(trimmedValue, featureIndex.names(columnIndex))
}
}
}.toArray
}
Expand All @@ -71,14 +84,17 @@ object CsvLoader {
private def inferLabelEncoder(lines: List[String], na: String, featureIndex: FeatureIndex): LabelEncoder = {
val encoder = mutable.AnyRefMap[String, mutable.AnyRefMap[String, Double]]()
val categoricalFeatures = featureIndex.categorical
categoricalFeatures.names.foreach { name => encoder(name) = mutable.AnyRefMap[String, Double]() }
categoricalFeatures.names.foreach { name =>
encoder(name) = mutable.AnyRefMap[String, Double]()
}

lines.foreach { rowValues =>
val rowValuesArray = rowValues.split(",").map(x => removeQuotes(x))
categoricalFeatures.columnIndices.zip(categoricalFeatures.names).foreach { case (columnIndex, name) =>
val featureValue = rowValuesArray(columnIndex)
if (featureValue != na && !encoder(name).contains(featureValue))
encoder(name)(featureValue) = encoder(name).size.toDouble
categoricalFeatures.columnIndices.zip(categoricalFeatures.names).foreach {
case (columnIndex, name) =>
val featureValue = rowValuesArray(columnIndex)
if (featureValue != na && !encoder(name).contains(featureValue))
encoder(name)(featureValue) = encoder(name).size.toDouble
}
}

Expand All @@ -90,16 +106,15 @@ object CsvLoader {
*
* @param encoder maps each categorical feature name to that feature's mapping from categorical values to numerical codes
*/
private class LabelEncoder(private val encoder: mutable.AnyRefMap[String, mutable.AnyRefMap[String, Double]]) {
private class LabelEncoder(private val encoder: mutable.AnyRefMap[String, mutable.AnyRefMap[String, Double]]) {
def encode(featureValue: String, featureName: String): Double = encoder(featureName)(featureValue)
}

/** Returns `s` with every double-quote (`"`) and single-quote (`'`) character removed. */
private def removeQuotes(s: String): String =
  s.filterNot(ch => ch == '"' || ch == '\'')

private def parseDouble(featureValue: String): Double = {
try
featureValue.toDouble
try featureValue.toDouble
catch {
case _: NumberFormatException =>
throw new IllegalArgumentException(
Expand Down
14 changes: 9 additions & 5 deletions src/main/scala/io/picnicml/doddlemodel/data/DatasetUtils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,20 @@ object DatasetUtils {
}

/** Splits the dataset into two subsets for training and testing and makes sure groups in each are non-overlapping. */
def splitDatasetWithGroups(x: Features,
y: Target,
groups: IntVector,
proportionTrain: Double = 0.5): GroupTrainTestSplit = {
def splitDatasetWithGroups(
x: Features,
y: Target,
groups: IntVector,
proportionTrain: Double = 0.5
): GroupTrainTestSplit = {
val numTrain = numberOfTrainExamplesBasedOnProportion(x.rows, proportionTrain)
val numSamplesPerGroup = hist(groups, numberOfUniqueGroups(groups)).hist.toArray
val (sortedNumSamplesPerGroup, toOriginalGroupIndex) = numSamplesPerGroup.zipWithIndex.sorted.unzip

val numGroupsInTrain = sortedNumSamplesPerGroup
.foldLeft(List(0)) { case (acc, currGroupSize) => (acc(0) + currGroupSize) :: acc }.reverse.drop(1)
.foldLeft(List(0)) { case (acc, currGroupSize) => (acc(0) + currGroupSize) :: acc }
.reverse
.drop(1)
.takeWhile(cumulativeNumSamples => cumulativeNumSamples <= numTrain)
.length

Expand Down
19 changes: 11 additions & 8 deletions src/main/scala/io/picnicml/doddlemodel/data/Feature.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@ object Feature {
}

@SerialVersionUID(0L)
class FeatureIndex(val names: IndexedSeq[String],
val types: IndexedSeq[FeatureType],
val columnIndices: IndexedSeq[Int]) extends Serializable {
class FeatureIndex(
val names: IndexedSeq[String],
val types: IndexedSeq[FeatureType],
val columnIndices: IndexedSeq[Int]
) extends Serializable {

def categorical: FeatureIndex = onlyFeaturesOfType[CategoricalFeature.type]

Expand All @@ -32,15 +34,15 @@ object Feature {
val subsetIndices = this.types.zipWithIndex.flatMap {
case (t, i) => if (cls.isInstance(t)) i.some else none[Int]
}
subset(subsetIndices:_*)
subset(subsetIndices: _*)
}

def subset(names: String*): FeatureIndex = {
val nameToIndex = this.names.zipWithIndex.toMap
subset(names.map(n => nameToIndex(n)):_*)
subset(names.map(n => nameToIndex(n)): _*)
}

def subset(indices: IndexedSeq[Int]): FeatureIndex = subset(indices:_*)
def subset(indices: IndexedSeq[Int]): FeatureIndex = subset(indices: _*)

// DummyImplicit is needed so that this overload does not have the same signature as subset(names: String*) after erasure
def subset(indices: Int*)(implicit di: DummyImplicit): FeatureIndex = new FeatureIndex(
Expand All @@ -52,8 +54,9 @@ object Feature {
def drop(index: Int): FeatureIndex = new FeatureIndex(
this.names.zipWithIndex.flatMap { case (n, i) => if (i != index) n.some else none[String] },
this.types.zipWithIndex.flatMap { case (t, i) => if (i != index) t.some else none[FeatureType] },
this.columnIndices.zipWithIndex.flatMap { case (ci, i) =>
if (i == index) none[Int] else if (i > index) (ci - 1).some else ci.some
this.columnIndices.zipWithIndex.flatMap {
case (ci, i) =>
if (i == index) none[Int] else if (i > index) (ci - 1).some else ci.some
}
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,13 @@ object ResourceDatasetLoaders {

private def getBufferedSourceFromResource(path: String): BufferedSource = {
val resourceUrl = getClass.getResource(path)
val file = if (resourceUrl.toString.startsWith("jar:"))
// reads file from JAR
readResourceFileWithinJar(path)
else
// reads file when using IDE
new File(resourceUrl.getFile)
val file =
if (resourceUrl.toString.startsWith("jar:"))
// reads file from JAR
readResourceFileWithinJar(path)
else
// reads file when using IDE
new File(resourceUrl.getFile)
if (file != null && !file.exists)
throw new RuntimeException(s"Error: File $file not found!")
Source.fromFile(file)
Expand Down
14 changes: 8 additions & 6 deletions src/main/scala/io/picnicml/doddlemodel/data/data.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@ package io.picnicml.doddlemodel.data

case class TrainTestSplit(xTr: Features, yTr: Target, xTe: Features, yTe: Target)

case class GroupTrainTestSplit(xTr: Features,
yTr: Target,
groupsTr: IntVector,
xTe: Features,
yTe: Target,
groupsTe: IntVector)
case class GroupTrainTestSplit(
xTr: Features,
yTr: Target,
groupsTr: IntVector,
xTe: Features,
yTe: Target,
groupsTe: IntVector
)
17 changes: 10 additions & 7 deletions src/main/scala/io/picnicml/doddlemodel/data/package.scala
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package io.picnicml.doddlemodel

import breeze.linalg.{DenseMatrix, DenseVector, unique}
import breeze.linalg.{unique, DenseMatrix, DenseVector}
import io.picnicml.doddlemodel.CrossScalaCompat.doubleOrdering
import io.picnicml.doddlemodel.data.Feature.FeatureIndex

Expand All @@ -24,17 +24,20 @@ package object data {

def numberOfUniqueGroups(groups: IntVector): Int = {
val uniqueGroups = unique(groups)
require(uniqueGroups.toArray.sorted sameElements Array.range(0, uniqueGroups.length),
"Invalid encoding of groups, all group indices in [0, numGroups) have to exist")
require(
uniqueGroups.toArray.sorted sameElements Array.range(0, uniqueGroups.length),
"Invalid encoding of groups, all group indices in [0, numGroups) have to exist"
)
uniqueGroups.length
}

def numberOfTargetClasses(y: Target): Int = {
val targetClasses = unique(y)
require(targetClasses.length >= 2,
"Target variable must be comprised of at least two categories")
require(targetClasses.toArray.sorted sameElements Array.range(0, targetClasses.length),
"Invalid encoding of categories in the target variable")
require(targetClasses.length >= 2, "Target variable must be comprised of at least two categories")
require(
targetClasses.toArray.sorted sameElements Array.range(0, targetClasses.length),
"Invalid encoding of categories in the target variable"
)
targetClasses.length
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,14 @@ object MostFrequentClassifier {
model.copy(numClasses = numClasses.some)

override protected def fitSafe(model: MostFrequentClassifier, x: Features, y: Target): MostFrequentClassifier = {
val mostFrequentClass = y.activeValuesIterator.foldLeft(Map[Double, Int]()) { (acc, x) =>
if (acc.contains(x)) acc + (x -> (acc(x) + 1)) else acc + (x -> 1)
}.toArray.sortBy(_._1).maxBy(_._2)._1
val mostFrequentClass = y.activeValuesIterator
.foldLeft(Map[Double, Int]()) { (acc, x) =>
if (acc.contains(x)) acc + (x -> (acc(x) + 1)) else acc + (x -> 1)
}
.toArray
.sortBy(_._1)
.maxBy(_._2)
._1

model.copy(mostFrequentClass = mostFrequentClass.some)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,13 @@ object StratifiedClassifier {
model.copy(numClasses = numClasses.some)

override protected def fitSafe(model: StratifiedClassifier, x: Features, y: Target): StratifiedClassifier = {
val probs = y.activeValuesIterator.foldLeft(Map[Double, Int]()) { (acc, x) =>
if (acc.contains(x)) acc + (x -> (acc(x) + 1)) else acc + (x -> 1)
}.toArray.sortBy(_._1).map(_._2 / y.length.toDouble)
val probs = y.activeValuesIterator
.foldLeft(Map[Double, Int]()) { (acc, x) =>
if (acc.contains(x)) acc + (x -> (acc(x) + 1)) else acc + (x -> 1)
}
.toArray
.sortBy(_._1)
.map(_._2 / y.length.toDouble)

model.copy(targetDistr = Multinomial[RealVector, Int](DenseVector(probs)).some)
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package io.picnicml.doddlemodel.dummy.classification

import breeze.linalg.{DenseVector, convert}
import breeze.linalg.{convert, DenseVector}
import breeze.stats.distributions.Rand
import cats.syntax.option._
import io.picnicml.doddlemodel.data.{Features, Simplex, Target}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ import io.picnicml.doddlemodel.typeclasses.Transformer
* val imputer = MeanValueImputer(featureIndex)
* val imputerSubsetOfColumns = MeanValueImputer(featureIndex.subset("f0", "f2"))
*/
case class MeanValueImputer private (private[impute] val means: Option[RealVector],
private val featureIndex: FeatureIndex)
case class MeanValueImputer private (
private[impute] val means: Option[RealVector],
private val featureIndex: FeatureIndex
)

object MeanValueImputer {

Expand All @@ -40,10 +42,11 @@ object MeanValueImputer {

override protected def transformSafe(model: MeanValueImputer, x: Features): Features = {
val xCopy = x.copy
model.featureIndex.numerical.columnIndices.zipWithIndex.foreach { case (colIndex, statisticIndex) =>
xCopy(::, colIndex).findAll(_.isNaN).iterator.foreach { rowIndex =>
xCopy(rowIndex, colIndex) = model.means.getOrBreak(statisticIndex)
}
model.featureIndex.numerical.columnIndices.zipWithIndex.foreach {
case (colIndex, statisticIndex) =>
xCopy(::, colIndex).findAll(_.isNaN).iterator.foreach { rowIndex =>
xCopy(rowIndex, colIndex) = model.means.getOrBreak(statisticIndex)
}
}
xCopy
}
Expand Down
Loading