diff --git a/.gitignore b/.gitignore index 8fd3247cec..ec1a9ac15e 100644 --- a/.gitignore +++ b/.gitignore @@ -47,4 +47,7 @@ node_modules/ .Rproj.user # R output -*.Rout \ No newline at end of file +*.Rout + +# Misc +.bsp diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 46b481c713..739b6065c4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -55,7 +55,7 @@ this process: #### Implement documentation -- Add a [sample Jupyter notebook](notebooks/samples) that shows the intended use +- Add a [sample Jupyter notebook](notebooks/) that shows the intended use case of your algorithm, with instructions in step-by-step manner. (The same notebook could be used for testing the code.) - Add in-line ScalaDoc comments to your source code, to generate the [API diff --git a/README.md b/README.md index 58c5cdcec6..f7618c97ea 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ PySpark](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/index.htm | | | | |:--:|:--:|:--:| -| **Isolation Forest on Spark** | [**CyberML**](https://github.com/Azure/mmlspark/blob/master/notebooks/samples/CyberML%20-%20Anomalous%20Access%20Detection.ipynb) | **Conditional KNN** | +| **Isolation Forest on Spark** | [**CyberML**](https://github.com/Azure/mmlspark/blob/master/notebooks/CyberML%20-%20Anomalous%20Access%20Detection.ipynb) | **Conditional KNN** | | Distributed Nonlinear Outlier Detection | Machine Learning Tools for Cyber Security | Scalable KNN Models with Conditional Queries | @@ -86,29 +86,29 @@ PySpark](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/index.htm - Train and evaluate a flight delay prediction system ([example 2]) - Finding anomalous data access patterns using the Access Anomalies package of CyberML ([example 11]) -See our [notebooks](notebooks/samples/) for all examples. +See our [notebooks](notebooks/) for all examples. 
-[example 1]: notebooks/samples/Classification%20-%20Adult%20Census.ipynb "Adult Census Income Training" +[example 1]: notebooks/Classification%20-%20Adult%20Census.ipynb "Adult Census Income Training" -[example 2]: notebooks/samples/Regression%20-%20Flight%20Delays.ipynb "Regression Example with Flight Delay Dataset" +[example 2]: notebooks/Regression%20-%20Flight%20Delays.ipynb "Regression Example with Flight Delay Dataset" -[example 3]: notebooks/samples/LightGBM%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb "Quantile Regression with LightGBM" +[example 3]: notebooks/LightGBM%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb "Quantile Regression with LightGBM" -[example 4]: notebooks/samples/TextAnalytics%20-%20Amazon%20Book%20Reviews.ipynb "Amazon Book Reviews - TextFeaturizer" +[example 4]: notebooks/TextAnalytics%20-%20Amazon%20Book%20Reviews.ipynb "Amazon Book Reviews - TextFeaturizer" -[example 5]: notebooks/samples/HyperParameterTuning%20-%20Fighting%20Breast%20Cancer.ipynb "Hyperparameter Tuning with MMLSpark" +[example 5]: notebooks/HyperParameterTuning%20-%20Fighting%20Breast%20Cancer.ipynb "Hyperparameter Tuning with MMLSpark" -[example 6]: notebooks/samples/DeepLearning%20-%20CIFAR10%20Convolutional%20Network.ipynb "CIFAR10 CNTK CNN Evaluation" +[example 6]: notebooks/DeepLearning%20-%20CIFAR10%20Convolutional%20Network.ipynb "CIFAR10 CNTK CNN Evaluation" -[example 7]: notebooks/samples/OpenCV%20-%20Pipeline%20Image%20Transformations.ipynb "Pipeline Image Transformations" +[example 7]: notebooks/OpenCV%20-%20Pipeline%20Image%20Transformations.ipynb "Pipeline Image Transformations" -[example 8]: notebooks/samples/DeepLearning%20-%20BiLSTM%20Medical%20Entity%20Extraction.ipynb "Medical Entity Extraction" +[example 8]: notebooks/DeepLearning%20-%20BiLSTM%20Medical%20Entity%20Extraction.ipynb "Medical Entity Extraction" -[example 9]: notebooks/samples/DeepLearning%20-%20Flower%20Image%20Classification.ipynb "Deep Flower Classification" +[example 9]: notebooks/DeepLearning%20-%20Flower%20Image%20Classification.ipynb "Deep Flower Classification" [example 10]: notebooks/gpu/DeepLearning%20-%20Distributed%20CNTK%20training.ipynb "CIFAR10 CNTK CNN Training" -[example 11]: notebooks/samples/CyberML%20-%20Anomalous%20Access%20Detection.ipynb "Access Anomalies documenation, training and evaluation example" +[example 11]: notebooks/CyberML%20-%20Anomalous%20Access%20Detection.ipynb "Access Anomalies documentation, training and evaluation example" ## A short example @@ -127,7 +127,7 @@ scoredImages = cntkModel.transform(imagesWithLabels) ... ``` -See [other sample notebooks](notebooks/samples/) as well as the MMLSpark +See [other sample notebooks](notebooks/) as well as the MMLSpark documentation for [Scala](http://mmlspark.azureedge.net/docs/scala/) and [PySpark](http://mmlspark.azureedge.net/docs/pyspark/). 
diff --git a/build.sbt b/build.sbt index 0d10df561f..130abf8606 100644 --- a/build.sbt +++ b/build.sbt @@ -1,22 +1,20 @@ import java.io.{File, PrintWriter} import java.net.URL + import org.apache.commons.io.FileUtils import sbt.ExclusionRule -import sbt.internal.util.ManagedLogger import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _} import scala.xml.transform.{RewriteRule, RuleTransformer} -import scala.sys.process.Process import BuildUtils._ +import CodegenPlugin.autoImport.pythonizedVersion +import sbt.Project.projectToRef +import xerial.sbt.Sonatype._ -val condaEnvName = "mmlspark" -name := "mmlspark" -organization := "com.microsoft.ml.spark" -scalaVersion := "2.12.10" +ThisBuild / organization := "com.microsoft.ml.spark" +ThisBuild / scalaVersion := "2.12.10" val sparkVersion = "3.0.1" -//val scalaMajorVersion = settingKey[String]("scalaMajorVersion") -//scalaMajorVersion := {scalaVersion.value.split(".".toCharArray).dropRight(0).mkString(".")} val scalaMajorVersion = 2.12 val excludes = Seq( @@ -24,42 +22,28 @@ val excludes = Seq( ExclusionRule("org.scalatest") ) -libraryDependencies ++= Seq( +val coreDependencies = Seq( "org.apache.spark" %% "spark-core" % sparkVersion % "compile", "org.apache.spark" %% "spark-mllib" % sparkVersion % "compile", "org.apache.spark" %% "spark-avro" % sparkVersion % "provided", "org.apache.spark" %% "spark-tags" % sparkVersion % "test", "org.scalatest" %% "scalatest" % "3.0.5" % "test") - -libraryDependencies ++= Seq( +val extraDependencies = Seq( "org.scalactic" %% "scalactic" % "3.0.5", "io.spray" %% "spray-json" % "1.3.2", - "com.microsoft.cntk" % "cntk" % "2.4", - "org.openpnp" % "opencv" % "3.2.0-1", "com.jcraft" % "jsch" % "0.1.54", - "com.microsoft.cognitiveservices.speech" % "client-sdk" % "1.14.0", "org.apache.httpcomponents" % "httpclient" % "4.5.6", "org.apache.httpcomponents" % "httpmime" % "4.5.6", - "com.microsoft.ml.lightgbm" % "lightgbmlib" % "3.2.110", - "com.github.vowpalwabbit" % "vw-jni" % "8.9.1", "com.linkedin.isolation-forest" %% "isolation-forest_3.0.0" % "1.0.1", ).map(d => d excludeAll (excludes: _*)) +val dependencies = coreDependencies ++ extraDependencies def txt(e: Elem, label: String): String = "\"" + e.child.filter(_.label == label).flatMap(_.text).mkString + "\"" -def activateCondaEnv: Seq[String] = { - if (sys.props("os.name").toLowerCase.contains("windows")) { - osPrefix ++ Seq("activate", condaEnvName, "&&") - } else { - Seq() - //TODO figure out why this doesent work - //Seq("/bin/bash", "-l", "-c", "source activate " + condaEnvName, "&&") - } -} - val omittedDeps = Set(s"spark-core_${scalaMajorVersion}", s"spark-mllib_${scalaMajorVersion}", "org.scala-lang") // skip dependency elements with a scope -pomPostProcess := { (node: XmlNode) => + +def pomPostFunc(node: XmlNode): scala.xml.Node = { new RuleTransformer(new RewriteRule { override def transform(node: XmlNode): XmlNodeSeq = node match { case e: Elem if e.label == "dependency" @@ -77,191 +61,17 @@ pomPostProcess := { (node: XmlNode) => }).transform(node).head } -resolvers += "Speech" at "https://mmlspark.blob.core.windows.net/maven/" - -val createCondaEnvTask = TaskKey[Unit]("createCondaEnv", "create conda env") -createCondaEnvTask := { - val s = streams.value - val hasEnv = Process("conda env list").lineStream.toList - .map(_.split("\\s+").head).contains(condaEnvName) - if (!hasEnv) { - runCmd(Seq("conda", "env", "create", "-f", "environment.yaml")) - } else { - println("Found conda env " + condaEnvName) - } -} - -val condaEnvLocation = 
TaskKey[String]("condaEnvLocation", "get install location of conda env") -condaEnvLocation := { - val s = streams.value - createCondaEnvTask.value - Process("conda env list").lineStream.toList - .map(_.split("\\s+")) - .map(l => (l.head, l.reverse.head)) - .filter(p => p._1 == condaEnvName) - .head._2 -} - - -val cleanCondaEnvTask = TaskKey[Unit]("cleanCondaEnv", "create conda env") -cleanCondaEnvTask := { - runCmd(Seq("conda", "env", "remove", "--name", condaEnvName, "-y")) -} - -val codegenTask = TaskKey[Unit]("codegen", "Generate Code") -codegenTask := { - (runMain in Test).toTask(" com.microsoft.ml.spark.codegen.CodeGen").value -} - -val testgenTask = TaskKey[Unit]("testgen", "Generate Tests") -testgenTask := { - (runMain in Test).toTask(" com.microsoft.ml.spark.codegen.TestGen").value -} - -val genDir = join("target", s"scala-${scalaMajorVersion}", "generated") -val unidocDir = join("target", s"scala-${scalaMajorVersion}", "unidoc") -val pythonSrcDir = join(genDir.toString, "src", "python") -val unifiedDocDir = join(genDir.toString, "doc") -val pythonDocDir = join(unifiedDocDir.toString, "pyspark") -val pythonPackageDir = join(genDir.toString, "package", "python") -val pythonTestDir = join(genDir.toString, "test", "python") -val rSrcDir = join(genDir.toString, "src", "R", "mmlspark") -val rPackageDir = join(genDir.toString, "package", "R") - -val pythonizedVersion = settingKey[String]("Pythonized version") -pythonizedVersion := { - if (version.value.contains("-")) { - version.value.split("-".head).head + ".dev1" - } else { - version.value - } -} - -val rVersion = settingKey[String]("R version") -rVersion := { - if (version.value.contains("-")) { - version.value.split("-".head).head - } else { - version.value - } -} - -def rCmd(cmd: Seq[String], wd: File, libPath: String): Unit = { - runCmd(activateCondaEnv ++ cmd, wd, Map("R_LIBS" -> libPath, "R_USER_LIBS" -> libPath)) -} - -val packageR = TaskKey[Unit]("packageR", "Generate roxygen docs and zip R package") -packageR := { - createCondaEnvTask.value - codegenTask.value - val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString - rCmd(Seq("R", "-q", "-e", "roxygen2::roxygenise()"), rSrcDir, libPath) - rPackageDir.mkdirs() - zipFolder(rSrcDir, new File(rPackageDir, s"mmlspark-${version.value}.zip")) -} - -val testR = TaskKey[Unit]("testR", "Run testthat on R tests") -testR := { - packageR.value - publishLocal.value - val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString - rCmd(Seq("R", "CMD", "INSTALL", "--no-multiarch", "--with-keep.source", "mmlspark"), rSrcDir.getParentFile, libPath) - val testRunner = join("tools", "tests", "run_r_tests.R").getAbsolutePath - rCmd(Seq("Rscript", testRunner), rSrcDir, libPath) -} - -val publishR = TaskKey[Unit]("publishR", "publish R package to blob") -publishR := { - codegenTask.value - packageR.value - val rPackage = rPackageDir.listFiles().head - singleUploadToBlob(rPackage.toString, rPackage.getName, "rrr") -} - -val packagePythonTask = TaskKey[Unit]("packagePython", "Package python sdk") -packagePythonTask := { - codegenTask.value - createCondaEnvTask.value - val destPyDir = join("target", s"scala-${scalaMajorVersion}", "classes", "mmlspark") - if (destPyDir.exists()) FileUtils.forceDelete(destPyDir) - FileUtils.copyDirectory(join(pythonSrcDir.getAbsolutePath, "mmlspark"), destPyDir) - runCmd( - activateCondaEnv ++ - Seq(s"python", "setup.py", "bdist_wheel", "--universal", "-d", s"${pythonPackageDir.absolutePath}"), - pythonSrcDir) -} - -val 
installPipPackageTask = TaskKey[Unit]("installPipPackage", "install python sdk") -installPipPackageTask := { - packagePythonTask.value - publishLocal.value - runCmd( - activateCondaEnv ++ Seq("pip", "install", "-I", - s"mmlspark-${pythonizedVersion.value}-py2.py3-none-any.whl"), - pythonPackageDir) -} - -val generatePythonDoc = TaskKey[Unit]("generatePythonDoc", "Generate sphinx docs for python") -generatePythonDoc := { - installPipPackageTask.value - runCmd(activateCondaEnv ++ Seq("sphinx-apidoc", "-f", "-o", "doc", "."), - join(pythonSrcDir.toString, "mmlspark")) - runCmd(activateCondaEnv ++ Seq("sphinx-build", "-b", "html", "doc", "../../../doc/pyspark"), - join(pythonSrcDir.toString, "mmlspark")) -} - -val publishDocs = TaskKey[Unit]("publishDocs", "publish docs for scala and python") -publishDocs := { - generatePythonDoc.value - (Compile / unidoc).value - val html = - """ - |
-      |<html><body><pre style="font-size: 150%;">
-      |<a href="pyspark/index.html">pyspark/</a>
-      |<a href="scala/index.html">scala/</a>
-      |</pre></body></html>
- """.stripMargin - val scalaDir = join(unifiedDocDir.toString, "scala") - if (scalaDir.exists()) FileUtils.forceDelete(scalaDir) - FileUtils.copyDirectory(unidocDir, scalaDir) - FileUtils.writeStringToFile(join(unifiedDocDir.toString, "index.html"), html, "utf-8") - uploadToBlob(unifiedDocDir.toString, version.value, "docs") -} - -val publishPython = TaskKey[Unit]("publishPython", "publish python wheel") -publishPython := { - publishLocal.value - packagePythonTask.value - singleUploadToBlob( - join(pythonPackageDir.toString, s"mmlspark-${pythonizedVersion.value}-py2.py3-none-any.whl").toString, - version.value + s"/mmlspark-${pythonizedVersion.value}-py2.py3-none-any.whl", - "pip") -} +pomPostProcess := pomPostFunc -val testPythonTask = TaskKey[Unit]("testPython", "test python sdk") - -testPythonTask := { - installPipPackageTask.value - testgenTask.value - runCmd( - activateCondaEnv ++ Seq("python", - "-m", - "pytest", - "--cov=mmlspark", - "--junitxml=../../../../python-test-results.xml", - "--cov-report=xml", - "mmlsparktest" - ), - new File(s"target/scala-${scalaMajorVersion}/generated/test/python/") - ) -} +val speechResolver = "Speech" at "https://mmlspark.blob.core.windows.net/maven/" val getDatasetsTask = TaskKey[Unit]("getDatasets", "download datasets used for testing") val datasetName = "datasets-2020-08-27.tgz" val datasetUrl = new URL(s"https://mmlspark.blob.core.windows.net/installers/$datasetName") val datasetDir = settingKey[File]("The directory that holds the dataset") -datasetDir := { - join(target.value.toString, s"scala-${scalaMajorVersion}", "datasets", datasetName.split(".".toCharArray.head).head) +ThisBuild / datasetDir := { + join(artifactPath.in(packageBin).in(Compile).value.getParentFile, + "datasets", datasetName.split(".".toCharArray.head).head) } getDatasetsTask := { @@ -276,48 +86,61 @@ getDatasetsTask := { val genBuildInfo = TaskKey[Unit]("genBuildInfo", "generate a build info file") genBuildInfo := { - val buildInfo = + val docInfo = s""" - |MMLSpark Build and Release Information - |--------------- - | - |### Maven Coordinates - | `${organization.value}:${name.value}_${scalaMajorVersion}:${version.value}` - | - |### Maven Resolver - | `https://mmlspark.azureedge.net/maven` | |### Documentation Pages: |[Scala Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/scala/index.html) |[Python Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/pyspark/index.html) | """.stripMargin + val buildInfo = (root / blobArtifactInfo).value + docInfo val infoFile = join("target", "Build.md") if (infoFile.exists()) FileUtils.forceDelete(infoFile) FileUtils.writeStringToFile(infoFile, buildInfo, "utf-8") } -val setupTask = TaskKey[Unit]("setup", "set up library for intellij") -setupTask := { - (Compile / compile).toTask.value - (Test / compile).toTask.value - getDatasetsTask.value +val rootGenDir = SettingKey[File]("rootGenDir") +rootGenDir := { + val targetDir = artifactPath.in(packageBin).in(Compile).in(root).value.getParentFile + join(targetDir, "generated") } -val publishBlob = TaskKey[Unit]("publishBlob", "publish the library to mmlspark blob") -publishBlob := { - publishM2.value - val scalaVersionSuffix = scalaVersion.value.split(".".toCharArray.head).dropRight(1).mkString(".") - val nameAndScalaVersion = s"${name.value}_$scalaVersionSuffix" - - val localPackageFolder = join( - Seq(new File(new URI(Resolver.mavenLocal.root)).getAbsolutePath) - ++ organization.value.split(".".toCharArray.head) - ++ Seq(nameAndScalaVersion, 
version.value): _*).toString +val generatePythonDoc = TaskKey[Unit]("generatePythonDoc", "Generate sphinx docs for python") +generatePythonDoc := { + installPipPackage.all(ScopeFilter( + inProjects(core, deepLearning, cognitive, vw, lightgbm, opencv), + inConfigurations(Compile))).value + mergePyCode.all(ScopeFilter( + inProjects(core, deepLearning, cognitive, vw, lightgbm, opencv), + inConfigurations(Compile)) + ).value + val targetDir = artifactPath.in(packageBin).in(Compile).in(root).value.getParentFile + val codegenDir = join(targetDir, "generated") + val dir = join(codegenDir, "src", "python", "mmlspark") + runCmd(activateCondaEnv.value ++ Seq("sphinx-apidoc", "-f", "-o", "doc", "."), dir) + runCmd(activateCondaEnv.value ++ Seq("sphinx-build", "-b", "html", "doc", "../../../doc/pyspark"), dir) +} - val blobMavenFolder = organization.value.replace(".", "/") + - s"/$nameAndScalaVersion/${version.value}" - uploadToBlob(localPackageFolder, blobMavenFolder, "maven") +val publishDocs = TaskKey[Unit]("publishDocs", "publish docs for scala and python") +publishDocs := { + //generatePythonDoc.value + (root / Compile / unidoc).value + val html = + """ + |
+      |<html><body><pre style="font-size: 150%;">
+      |<a href="pyspark/index.html">pyspark/</a>
+      |<a href="scala/index.html">scala/</a>
+      |</pre></body></html>
+ """.stripMargin + val targetDir = artifactPath.in(packageBin).in(Compile).in(root).value.getParentFile + val codegenDir = join(targetDir, "generated") + val unifiedDocDir = join(codegenDir, "doc") + val scalaDir = join(unifiedDocDir.toString, "scala") + if (scalaDir.exists()) FileUtils.forceDelete(scalaDir) + FileUtils.copyDirectory(join(targetDir, "unidoc"), scalaDir) + FileUtils.writeStringToFile(join(unifiedDocDir.toString, "index.html"), html, "utf-8") + uploadToBlob(unifiedDocDir.toString, version.value, "docs") } val release = TaskKey[Unit]("release", "publish the library to mmlspark blob") @@ -355,11 +178,8 @@ publishBadges := { } val settings = Seq( - (scalastyleConfig in Test) := baseDirectory.value / "scalastyle-test-config.xml", + (scalastyleConfig in Test) := (ThisBuild / baseDirectory).value / "scalastyle-test-config.xml", logBuffered in Test := false, - buildInfoKeys := Seq[BuildInfoKey]( - name, version, scalaVersion, sbtVersion, - baseDirectory, datasetDir, pythonizedVersion, rVersion), parallelExecution in Test := false, test in assembly := {}, assemblyMergeStrategy in assembly := { @@ -367,14 +187,84 @@ val settings = Seq( case x => MergeStrategy.first }, assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false), - buildInfoPackage := "com.microsoft.ml.spark.build") - -lazy val mmlspark = (project in file(".")) - .enablePlugins(BuildInfoPlugin) - .enablePlugins(ScalaUnidocPlugin) - .settings(settings: _*) + autoAPIMappings := true, + pomPostProcess := pomPostFunc, +) +ThisBuild / publishMavenStyle := true + +lazy val core = (project in file("core")) + .enablePlugins(BuildInfoPlugin && SbtPlugin) + .settings((settings ++ Seq( + libraryDependencies ++= dependencies, + buildInfoKeys ++= Seq[BuildInfoKey]( + datasetDir, + version, + scalaVersion, + sbtVersion, + baseDirectory + ), + name := "mmlspark-core", + buildInfoPackage := "com.microsoft.ml.spark.build", + )): _*) + +lazy val deepLearning = (project in file("deep-learning")) + .enablePlugins(SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + libraryDependencies += ("com.microsoft.cntk" % "cntk" % "2.4"), + name := "mmlspark-deep-learning", + )): _*) + +lazy val lightgbm = (project in file("lightgbm")) + .enablePlugins(SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + libraryDependencies += ("com.microsoft.ml.lightgbm" % "lightgbmlib" % "3.2.110"), + name := "mmlspark-lightgbm" + )): _*) + +lazy val vw = (project in file("vw")) + .enablePlugins(SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + libraryDependencies += ("com.github.vowpalwabbit" % "vw-jni" % "8.9.1"), + name := "mmlspark-vw" + )): _*) + +lazy val cognitive = (project in file("cognitive")) + .enablePlugins(SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + libraryDependencies += ("com.microsoft.cognitiveservices.speech" % "client-sdk" % "1.14.0"), + resolvers += speechResolver, + name := "mmlspark-cognitive" + )): _*) + +lazy val opencv = (project in file("opencv")) + .enablePlugins(SbtPlugin) + .dependsOn(core % "test->test;compile->compile") + .settings((settings ++ Seq( + libraryDependencies += ("org.openpnp" % "opencv" % "3.2.0-1"), + name := "mmlspark-opencv" + )): _*) + +lazy val root = (project in file(".")) + .aggregate(core, deepLearning, cognitive, vw, lightgbm, opencv) + .dependsOn(core, deepLearning, cognitive, vw, lightgbm, opencv) + 
.enablePlugins(ScalaUnidocPlugin && SbtPlugin) + .disablePlugins(CodegenPlugin) + .settings(settings ++ Seq( + name := "mmlspark", + )) -import xerial.sbt.Sonatype._ +val setupTask = TaskKey[Unit]("setup", "set up library for intellij") +setupTask := { + compile.all(ScopeFilter( + inProjects(core, deepLearning, cognitive, vw, lightgbm, opencv), + inConfigurations(Compile, Test)) + ).value + getDatasetsTask.value +} sonatypeProjectHosting := Some( GitHubHosting("Azure", "MMLSpark", "mmlspark-support@microsot.com")) @@ -389,33 +279,30 @@ developers := List( ) licenses += ("MIT", url("https://github.com/Azure/mmlspark/blob/master/LICENSE")) -publishMavenStyle := true - -credentials += Credentials("Sonatype Nexus Repository Manager", - "oss.sonatype.org", - Secrets.nexusUsername, - Secrets.nexusPassword) - -pgpPassphrase := Some(Secrets.pgpPassword.toCharArray) -pgpSecretRing := { - val temp = File.createTempFile("secret", ".asc") - new PrintWriter(temp) { - write(Secrets.pgpPrivate); - close() - } - temp -} -pgpPublicRing := { - val temp = File.createTempFile("public", ".asc") - new PrintWriter(temp) { - write(Secrets.pgpPublic); - close() - } - temp -} + +//credentials += Credentials("Sonatype Nexus Repository Manager", +// "oss.sonatype.org", +// Secrets.nexusUsername, +// Secrets.nexusPassword) +// +//pgpPassphrase := Some(Secrets.pgpPassword.toCharArray) +//pgpSecretRing := { +// val temp = File.createTempFile("secret", ".asc") +// new PrintWriter(temp) { +// write(Secrets.pgpPrivate); +// close() +// } +// temp +//} +//pgpPublicRing := { +// val temp = File.createTempFile("public", ".asc") +// new PrintWriter(temp) { +// write(Secrets.pgpPublic); +// close() +// } +// temp +//} +//publishTo := sonatypePublishToBundle.value dynverSonatypeSnapshots in ThisBuild := true dynverSeparator in ThisBuild := "-" -publishTo := sonatypePublishToBundle.value - -// Break Cache - 1 diff --git a/src/main/python/mmlspark/cognitive/AzureSearchWriter.py b/cognitive/src/main/python/mmlspark/cognitive/AzureSearchWriter.py similarity index 100% rename from src/main/python/mmlspark/cognitive/AzureSearchWriter.py rename to cognitive/src/main/python/mmlspark/cognitive/AzureSearchWriter.py diff --git a/src/main/python/mmlspark/cognitive/BingImageSearch.py b/cognitive/src/main/python/mmlspark/cognitive/BingImageSearch.py similarity index 100% rename from src/main/python/mmlspark/cognitive/BingImageSearch.py rename to cognitive/src/main/python/mmlspark/cognitive/BingImageSearch.py diff --git a/src/__init__.py b/cognitive/src/main/python/mmlspark/cognitive/__init__.py similarity index 100% rename from src/__init__.py rename to cognitive/src/main/python/mmlspark/cognitive/__init__.py diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnamolyDetection.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AnomalyDetectorSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala 
b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AudioStreams.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala index 96024a68b6..b405bb13b0 100644 --- a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala +++ b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearch.scala @@ -143,7 +143,8 @@ object AzureSearchWriter extends IndexParser with SLogging { val Logger: Logger = LogManager.getRootLogger - private def checkForErrors(fatal: Boolean)(errorRow: Row, inputRow: Row): Option[Row] = { + private def checkForErrors( + fatal: Boolean)(errorRow: Row, inputRow: Row): Option[Row] = { Option(errorRow).map { r => val message = s"Service Exception:\n\t ${r.toString()} \n for input:\n\t ${inputRow.toString()}" if (fatal) { diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchAPI.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/AzureSearchSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/BingImageSearch.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVision.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala 
b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ComputerVisionSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/Face.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/FaceSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/ImageSearchSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/RESTHelpers.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechAPI.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToText.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala similarity index 98% rename from src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala index 51a965b0d0..45447ac5f2 100644 --- a/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala +++ 
b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/SpeechToTextSDK.scala @@ -8,15 +8,17 @@ import java.lang.ProcessBuilder.Redirect import java.net.{URI, URL} import java.util.UUID import java.util.concurrent.{LinkedBlockingQueue, TimeUnit} + import com.microsoft.cognitiveservices.speech._ import com.microsoft.cognitiveservices.speech.audio._ -import com.microsoft.cognitiveservices.speech.transcription.{Conversation, ConversationTranscriber, - ConversationTranscriptionEventArgs, Participant} +import com.microsoft.cognitiveservices.speech.transcription.{ + Conversation, ConversationTranscriber, ConversationTranscriptionEventArgs, Participant} import com.microsoft.cognitiveservices.speech.util.EventHandler import com.microsoft.ml.spark.build.BuildInfo import com.microsoft.ml.spark.cognitive.SpeechFormat._ import com.microsoft.ml.spark.core.contracts.HasOutputCol import com.microsoft.ml.spark.core.schema.{DatasetExtensions, SparkBindings} +import com.microsoft.ml.spark.core.utils.OsUtils import com.microsoft.ml.spark.io.http.HasURL import com.microsoft.ml.spark.logging.BasicLogging import com.microsoft.ml.spark.{CompressedStream, WavStream} @@ -36,10 +38,6 @@ import spray.json._ import scala.concurrent.{ExecutionContext, Future, blocking} import scala.language.existentials -object OsUtils { - val IsWindows: Boolean = System.getProperty("os.name").toLowerCase().indexOf("win") >= 0 -} - object SpeechToTextSDK extends ComplexParamsReadable[SpeechToTextSDK] private[ml] class BlockingQueueIterator[T](lbq: LinkedBlockingQueue[Option[T]], diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalytics.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala rename to cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/TextAnalyticsSchemasV2.scala diff --git a/src/main/__init__.py b/cognitive/src/test/python/mmlsparktest/cognitive/__init__.py similarity index 100% rename from src/main/__init__.py rename to cognitive/src/test/python/mmlsparktest/cognitive/__init__.py diff --git a/src/test/python/mmlsparktest/cognitive/test_simple.py b/cognitive/src/test/python/mmlsparktest/cognitive/test_simple.py similarity index 100% rename from src/test/python/mmlsparktest/cognitive/test_simple.py rename to cognitive/src/test/python/mmlsparktest/cognitive/test_simple.py diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala rename to 
cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/AnamolyDetectionSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala similarity index 99% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala index 11a75834a4..6255d9462b 100644 --- a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala +++ b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ComputerVisionSuite.scala @@ -9,12 +9,10 @@ import com.microsoft.ml.spark.core.test.base.{Flaky, TestBase} import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import org.apache.spark.ml.NamespaceInjections.pipelineModel import org.apache.spark.ml.util.MLReadable -import org.apache.spark.sql.functions.{corr, typedLit} +import org.apache.spark.sql.functions.typedLit import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.scalactic.Equality -import org.scalatest.Assertion import com.microsoft.ml.spark.FluentAPI._ -import com.microsoft.ml.spark.featurize.text.PageSplitter trait CognitiveKey { lazy val cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", Secrets.CognitiveApiKey) diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceAPI.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/FaceSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/ImageSearchSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split1/TextAnalyticsSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SearchWriterSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala similarity index 100% rename from 
src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split2/SpeechToTextSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/cognitive/split3/SpeechToTextSDKSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/core/utils/ModelEqualitySuite.scala b/cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/ModelEqualitySuite.scala similarity index 94% rename from src/test/scala/com/microsoft/ml/spark/core/utils/ModelEqualitySuite.scala rename to cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/ModelEqualitySuite.scala index 620ab98aa2..d88d70d63a 100644 --- a/src/test/scala/com/microsoft/ml/spark/core/utils/ModelEqualitySuite.scala +++ b/cognitive/src/test/scala/com/microsoft/ml/spark/core/utils/utils/ModelEqualitySuite.scala @@ -1,11 +1,12 @@ // Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. -package com.microsoft.ml.spark.core.utils +package com.microsoft.ml.spark.core.utils.utils import com.microsoft.ml.spark.cognitive.TextSentiment import com.microsoft.ml.spark.core.env.FileUtilities.join import com.microsoft.ml.spark.core.test.base.TestBase +import com.microsoft.ml.spark.core.utils.ModelEquality import com.microsoft.ml.spark.stages.DropColumns class ModelEqualitySuite extends TestBase { diff --git a/src/main/python/LICENSE.txt b/core/src/main/python/LICENSE.txt similarity index 100% rename from src/main/python/LICENSE.txt rename to core/src/main/python/LICENSE.txt diff --git a/src/main/python/MANIFEST.in b/core/src/main/python/MANIFEST.in similarity index 100% rename from src/main/python/MANIFEST.in rename to core/src/main/python/MANIFEST.in diff --git a/src/main/python/__init__.py b/core/src/main/python/__init__.py similarity index 100% rename from src/main/python/__init__.py rename to core/src/main/python/__init__.py diff --git a/src/main/python/mmlspark/README.txt b/core/src/main/python/mmlspark/README.txt similarity index 100% rename from src/main/python/mmlspark/README.txt rename to core/src/main/python/mmlspark/README.txt diff --git a/src/main/python/mmlspark/__init__.py b/core/src/main/python/mmlspark/__init__.py similarity index 100% rename from src/main/python/mmlspark/__init__.py rename to core/src/main/python/mmlspark/__init__.py diff --git a/src/main/python/mmlspark/automl/BestModel.py b/core/src/main/python/mmlspark/automl/BestModel.py similarity index 100% rename from src/main/python/mmlspark/automl/BestModel.py rename to core/src/main/python/mmlspark/automl/BestModel.py diff --git a/src/main/python/mmlspark/automl/HyperparamBuilder.py b/core/src/main/python/mmlspark/automl/HyperparamBuilder.py similarity index 100% rename from src/main/python/mmlspark/automl/HyperparamBuilder.py rename to core/src/main/python/mmlspark/automl/HyperparamBuilder.py diff --git a/src/main/python/mmlspark/automl/TuneHyperparametersModel.py b/core/src/main/python/mmlspark/automl/TuneHyperparametersModel.py similarity index 100% rename from src/main/python/mmlspark/automl/TuneHyperparametersModel.py rename to 
core/src/main/python/mmlspark/automl/TuneHyperparametersModel.py diff --git a/src/main/python/mmlspark/automl/__init__.py b/core/src/main/python/mmlspark/automl/__init__.py similarity index 100% rename from src/main/python/mmlspark/automl/__init__.py rename to core/src/main/python/mmlspark/automl/__init__.py diff --git a/src/main/python/mmlspark/cntk/__init__.py b/core/src/main/python/mmlspark/core/__init__.py similarity index 100% rename from src/main/python/mmlspark/cntk/__init__.py rename to core/src/main/python/mmlspark/core/__init__.py diff --git a/src/main/python/mmlspark/core/schema/TypeConversionUtils.py b/core/src/main/python/mmlspark/core/schema/TypeConversionUtils.py similarity index 100% rename from src/main/python/mmlspark/core/schema/TypeConversionUtils.py rename to core/src/main/python/mmlspark/core/schema/TypeConversionUtils.py diff --git a/src/main/python/mmlspark/core/schema/Utils.py b/core/src/main/python/mmlspark/core/schema/Utils.py similarity index 100% rename from src/main/python/mmlspark/core/schema/Utils.py rename to core/src/main/python/mmlspark/core/schema/Utils.py diff --git a/src/main/python/mmlspark/cognitive/__init__.py b/core/src/main/python/mmlspark/core/schema/__init__.py similarity index 100% rename from src/main/python/mmlspark/cognitive/__init__.py rename to core/src/main/python/mmlspark/core/schema/__init__.py diff --git a/src/main/python/mmlspark/core/__init__.py b/core/src/main/python/mmlspark/core/serialize/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/__init__.py rename to core/src/main/python/mmlspark/core/serialize/__init__.py diff --git a/src/main/python/mmlspark/core/serialize/java_params_patch.py b/core/src/main/python/mmlspark/core/serialize/java_params_patch.py similarity index 100% rename from src/main/python/mmlspark/core/serialize/java_params_patch.py rename to core/src/main/python/mmlspark/core/serialize/java_params_patch.py diff --git a/src/main/python/mmlspark/core/spark/FluentAPI.py b/core/src/main/python/mmlspark/core/spark/FluentAPI.py similarity index 100% rename from src/main/python/mmlspark/core/spark/FluentAPI.py rename to core/src/main/python/mmlspark/core/spark/FluentAPI.py diff --git a/src/main/python/mmlspark/core/schema/__init__.py b/core/src/main/python/mmlspark/core/spark/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/schema/__init__.py rename to core/src/main/python/mmlspark/core/spark/__init__.py diff --git a/src/main/python/mmlspark/core/serialize/__init__.py b/core/src/main/python/mmlspark/cyber/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/serialize/__init__.py rename to core/src/main/python/mmlspark/cyber/__init__.py diff --git a/src/main/python/mmlspark/core/spark/__init__.py b/core/src/main/python/mmlspark/cyber/anomaly/__init__.py similarity index 100% rename from src/main/python/mmlspark/core/spark/__init__.py rename to core/src/main/python/mmlspark/cyber/anomaly/__init__.py diff --git a/src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py b/core/src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py similarity index 100% rename from src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py rename to core/src/main/python/mmlspark/cyber/anomaly/collaborative_filtering.py diff --git a/src/main/python/mmlspark/cyber/anomaly/complement_access.py b/core/src/main/python/mmlspark/cyber/anomaly/complement_access.py similarity index 100% rename from 
src/main/python/mmlspark/cyber/anomaly/complement_access.py rename to core/src/main/python/mmlspark/cyber/anomaly/complement_access.py diff --git a/src/main/python/mmlspark/cyber/dataset.py b/core/src/main/python/mmlspark/cyber/dataset.py similarity index 100% rename from src/main/python/mmlspark/cyber/dataset.py rename to core/src/main/python/mmlspark/cyber/dataset.py diff --git a/src/main/python/mmlspark/cyber/__init__.py b/core/src/main/python/mmlspark/cyber/feature/__init__.py similarity index 100% rename from src/main/python/mmlspark/cyber/__init__.py rename to core/src/main/python/mmlspark/cyber/feature/__init__.py diff --git a/src/main/python/mmlspark/cyber/feature/indexers.py b/core/src/main/python/mmlspark/cyber/feature/indexers.py similarity index 100% rename from src/main/python/mmlspark/cyber/feature/indexers.py rename to core/src/main/python/mmlspark/cyber/feature/indexers.py diff --git a/src/main/python/mmlspark/cyber/feature/scalers.py b/core/src/main/python/mmlspark/cyber/feature/scalers.py similarity index 100% rename from src/main/python/mmlspark/cyber/feature/scalers.py rename to core/src/main/python/mmlspark/cyber/feature/scalers.py diff --git a/src/main/python/mmlspark/cyber/anomaly/__init__.py b/core/src/main/python/mmlspark/cyber/utils/__init__.py similarity index 100% rename from src/main/python/mmlspark/cyber/anomaly/__init__.py rename to core/src/main/python/mmlspark/cyber/utils/__init__.py diff --git a/src/main/python/mmlspark/cyber/utils/spark_utils.py b/core/src/main/python/mmlspark/cyber/utils/spark_utils.py similarity index 100% rename from src/main/python/mmlspark/cyber/utils/spark_utils.py rename to core/src/main/python/mmlspark/cyber/utils/spark_utils.py diff --git a/src/main/python/mmlspark/doc/conf.py b/core/src/main/python/mmlspark/doc/conf.py similarity index 100% rename from src/main/python/mmlspark/doc/conf.py rename to core/src/main/python/mmlspark/doc/conf.py diff --git a/src/main/python/mmlspark/doc/index.rst b/core/src/main/python/mmlspark/doc/index.rst similarity index 100% rename from src/main/python/mmlspark/doc/index.rst rename to core/src/main/python/mmlspark/doc/index.rst diff --git a/src/main/python/mmlspark/doc/scala.rst b/core/src/main/python/mmlspark/doc/scala.rst similarity index 100% rename from src/main/python/mmlspark/doc/scala.rst rename to core/src/main/python/mmlspark/doc/scala.rst diff --git a/src/main/python/mmlspark/downloader/ModelDownloader.py b/core/src/main/python/mmlspark/downloader/ModelDownloader.py similarity index 100% rename from src/main/python/mmlspark/downloader/ModelDownloader.py rename to core/src/main/python/mmlspark/downloader/ModelDownloader.py diff --git a/src/main/python/mmlspark/cyber/feature/__init__.py b/core/src/main/python/mmlspark/downloader/__init__.py similarity index 100% rename from src/main/python/mmlspark/cyber/feature/__init__.py rename to core/src/main/python/mmlspark/downloader/__init__.py diff --git a/src/main/python/mmlspark/io/IOImplicits.py b/core/src/main/python/mmlspark/io/IOImplicits.py similarity index 100% rename from src/main/python/mmlspark/io/IOImplicits.py rename to core/src/main/python/mmlspark/io/IOImplicits.py diff --git a/src/main/python/mmlspark/cyber/utils/__init__.py b/core/src/main/python/mmlspark/io/__init__.py similarity index 100% rename from src/main/python/mmlspark/cyber/utils/__init__.py rename to core/src/main/python/mmlspark/io/__init__.py diff --git a/src/main/python/mmlspark/io/binary/BinaryFileReader.py 
b/core/src/main/python/mmlspark/io/binary/BinaryFileReader.py similarity index 100% rename from src/main/python/mmlspark/io/binary/BinaryFileReader.py rename to core/src/main/python/mmlspark/io/binary/BinaryFileReader.py diff --git a/src/main/python/mmlspark/downloader/__init__.py b/core/src/main/python/mmlspark/io/binary/__init__.py similarity index 100% rename from src/main/python/mmlspark/downloader/__init__.py rename to core/src/main/python/mmlspark/io/binary/__init__.py diff --git a/src/main/python/mmlspark/io/http/HTTPFunctions.py b/core/src/main/python/mmlspark/io/http/HTTPFunctions.py similarity index 100% rename from src/main/python/mmlspark/io/http/HTTPFunctions.py rename to core/src/main/python/mmlspark/io/http/HTTPFunctions.py diff --git a/src/main/python/mmlspark/io/http/JSONOutputParser.py b/core/src/main/python/mmlspark/io/http/JSONOutputParser.py similarity index 100% rename from src/main/python/mmlspark/io/http/JSONOutputParser.py rename to core/src/main/python/mmlspark/io/http/JSONOutputParser.py diff --git a/src/main/python/mmlspark/io/http/ServingFunctions.py b/core/src/main/python/mmlspark/io/http/ServingFunctions.py similarity index 100% rename from src/main/python/mmlspark/io/http/ServingFunctions.py rename to core/src/main/python/mmlspark/io/http/ServingFunctions.py diff --git a/src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py b/core/src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py similarity index 100% rename from src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py rename to core/src/main/python/mmlspark/io/http/SimpleHTTPTransformer.py diff --git a/src/main/python/mmlspark/image/__init__.py b/core/src/main/python/mmlspark/io/http/__init__.py similarity index 100% rename from src/main/python/mmlspark/image/__init__.py rename to core/src/main/python/mmlspark/io/http/__init__.py diff --git a/src/main/python/mmlspark/io/image/ImageUtils.py b/core/src/main/python/mmlspark/io/image/ImageUtils.py similarity index 100% rename from src/main/python/mmlspark/io/image/ImageUtils.py rename to core/src/main/python/mmlspark/io/image/ImageUtils.py diff --git a/src/main/python/mmlspark/io/__init__.py b/core/src/main/python/mmlspark/io/image/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/__init__.py rename to core/src/main/python/mmlspark/io/image/__init__.py diff --git a/src/main/python/mmlspark/io/powerbi/PowerBIWriter.py b/core/src/main/python/mmlspark/io/powerbi/PowerBIWriter.py similarity index 100% rename from src/main/python/mmlspark/io/powerbi/PowerBIWriter.py rename to core/src/main/python/mmlspark/io/powerbi/PowerBIWriter.py diff --git a/src/main/python/mmlspark/io/binary/__init__.py b/core/src/main/python/mmlspark/io/powerbi/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/binary/__init__.py rename to core/src/main/python/mmlspark/io/powerbi/__init__.py diff --git a/src/main/python/mmlspark/nn/ConditionalBallTree.py b/core/src/main/python/mmlspark/nn/ConditionalBallTree.py similarity index 100% rename from src/main/python/mmlspark/nn/ConditionalBallTree.py rename to core/src/main/python/mmlspark/nn/ConditionalBallTree.py diff --git a/src/main/python/mmlspark/io/http/__init__.py b/core/src/main/python/mmlspark/nn/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/http/__init__.py rename to core/src/main/python/mmlspark/nn/__init__.py diff --git a/src/main/python/mmlspark/io/image/__init__.py b/core/src/main/python/mmlspark/plot/__init__.py similarity index 100% rename from 
src/main/python/mmlspark/io/image/__init__.py rename to core/src/main/python/mmlspark/plot/__init__.py diff --git a/src/main/python/mmlspark/plot/plot.py b/core/src/main/python/mmlspark/plot/plot.py similarity index 100% rename from src/main/python/mmlspark/plot/plot.py rename to core/src/main/python/mmlspark/plot/plot.py diff --git a/src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py b/core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py similarity index 100% rename from src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py rename to core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplit.py diff --git a/src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py b/core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py similarity index 100% rename from src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py rename to core/src/main/python/mmlspark/recommendation/RankingTrainValidationSplitModel.py diff --git a/src/main/python/mmlspark/recommendation/SARModel.py b/core/src/main/python/mmlspark/recommendation/SARModel.py similarity index 100% rename from src/main/python/mmlspark/recommendation/SARModel.py rename to core/src/main/python/mmlspark/recommendation/SARModel.py diff --git a/src/main/python/mmlspark/recommendation/__init__.py b/core/src/main/python/mmlspark/recommendation/__init__.py similarity index 100% rename from src/main/python/mmlspark/recommendation/__init__.py rename to core/src/main/python/mmlspark/recommendation/__init__.py diff --git a/src/main/python/mmlspark/stages/UDFTransformer.py b/core/src/main/python/mmlspark/stages/UDFTransformer.py similarity index 100% rename from src/main/python/mmlspark/stages/UDFTransformer.py rename to core/src/main/python/mmlspark/stages/UDFTransformer.py diff --git a/src/main/python/mmlspark/io/powerbi/__init__.py b/core/src/main/python/mmlspark/stages/__init__.py similarity index 100% rename from src/main/python/mmlspark/io/powerbi/__init__.py rename to core/src/main/python/mmlspark/stages/__init__.py diff --git a/src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/DefaultHyperparams.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/EvaluationUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt b/core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt rename to core/src/main/scala/com/microsoft/ml/spark/automl/FindBestModel.txt diff --git a/src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala 
b/core/src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/HyperparamBuilder.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/ParamSpace.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala b/core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala rename to core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.scala diff --git a/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt b/core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt rename to core/src/main/scala/com/microsoft/ml/spark/automl/TuneHyperparameters.txt diff --git a/core/src/main/scala/com/microsoft/ml/spark/codegen/CodeGen.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodeGen.scala new file mode 100644 index 0000000000..72de88bd22 --- /dev/null +++ b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodeGen.scala @@ -0,0 +1,202 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.codegen + +import java.io.File + +import com.microsoft.ml.spark.codegen.CodegenConfigProtocol._ +import com.microsoft.ml.spark.core.env.FileUtilities._ +import org.apache.commons.io.FileUtils +import org.apache.commons.io.FilenameUtils._ +import com.microsoft.ml.spark.core.utils.JarLoadingUtils.instantiateServices +import spray.json._ + +object CodeGenUtils { + def clean(dir: File): Unit = if (dir.exists()) FileUtils.forceDelete(dir) + + def toDir(f: File): File = new File(f, File.separator) +} + + +object CodeGen { + + import CodeGenUtils._ + + def generatePythonClasses(conf: CodegenConfig): Unit = { + val instantiatedClasses = instantiateServices[PythonWrappable](conf.jarName) + instantiatedClasses.foreach { w => + println(w.getClass.getName) + w.makePyFile(conf) + } + } + + def generateRClasses(conf: CodegenConfig): Unit = { + val instantiatedClasses = instantiateServices[RWrappable](conf.jarName) + instantiatedClasses.foreach { w => + println(w.getClass.getName) + w.makeRFile(conf) + } + } + + private def makeInitFiles(conf: CodegenConfig, packageFolder: String = ""): Unit = { + val dir = new File(new File(conf.pySrcDir, "mmlspark"), packageFolder) + val packageString = if (packageFolder != "") packageFolder.replace("/", ".") else "" + val importStrings = + dir.listFiles.filter(_.isFile).sorted + .map(_.getName) + .filter(name => name.endsWith(".py") && !name.startsWith("_") && !name.startsWith("test")) + .map(name => s"from mmlspark$packageString.${getBaseName(name)} import *\n").mkString("") + val initFile = new File(dir, "__init__.py") + if (packageFolder != "") { + writeFile(initFile, conf.packageHelp(importStrings)) + } else if (initFile.exists()) { + initFile.delete() + } + dir.listFiles().filter(_.isDirectory).foreach(f => + makeInitFiles(conf, packageFolder + 
"/" + f.getName) + ) + } + + //noinspection ScalaStyle + def generateRPackageData(conf: CodegenConfig): Unit = { + // description file; need to encode version as decimal + val today = new java.text.SimpleDateFormat("yyyy-MM-dd") + .format(new java.util.Date()) + + conf.rSrcDir.mkdirs() + writeFile(new File(conf.rSrcDir.getParentFile, "DESCRIPTION"), + s"""|Package: ${conf.name} + |Title: Access to MMLSpark via R + |Description: Provides an interface to MMLSpark. + |Version: ${conf.rVersion} + |Date: $today + |Author: Microsoft Corporation + |Maintainer: MMLSpark Team + |URL: https://github.com/Azure/mmlspark + |BugReports: https://github.com/Azure/mmlspark/issues + |Depends: + | R (>= 2.12.0) + |Imports: + | sparklyr + |License: MIT + |Suggests: + | testthat (>= 3.0.0) + |Config/testthat/edition: 3 + |""".stripMargin) + + writeFile(new File(conf.rSrcDir, "package_register.R"), + s"""|#' @import sparklyr + |spark_dependencies <- function(spark_version, scala_version, ...) { + | spark_dependency( + | jars = c(), + | packages = c( + | "com.microsoft.ml.spark:${conf.name}:${conf.version}" + | ), + | repositories = c("https://mmlspark.azureedge.net/maven") + | ) + |} + | + |#' @import sparklyr + |.onLoad <- function(libname, pkgname) { + | sparklyr::register_extension(pkgname) + |} + |""".stripMargin) + + writeFile(new File(conf.rSrcDir.getParentFile, "mmlspark.Rproj"), + """ + |Version: 1.0 + | + |RestoreWorkspace: Default + |SaveWorkspace: Default + |AlwaysSaveHistory: Default + | + |EnableCodeIndexing: Yes + |UseSpacesForTab: Yes + |NumSpacesForTab: 4 + |Encoding: UTF-8 + | + |RnwWeave: Sweave + |LaTeX: pdfLaTeX + | + |BuildType: Package + |PackageUseDevtools: Yes + |PackageInstallArgs: --no-multiarch --with-keep.source + | + |""".stripMargin) + + } + + //noinspection ScalaStyle + def generatePyPackageData(conf: CodegenConfig): Unit = { + if (!conf.pySrcDir.exists()) { + conf.pySrcDir.mkdir() + } + writeFile(join(conf.pySrcDir, "setup.py"), + s""" + |# Copyright (C) Microsoft Corporation. All rights reserved. + |# Licensed under the MIT License. See LICENSE in project root for information. 
+ | + |import os + |from setuptools import setup, find_namespace_packages + |import codecs + |import os.path + | + |setup( + | name="${conf.name}", + | version="${conf.pythonizedVersion}", + | description="Microsoft ML for Spark", + | long_description="Microsoft ML for Apache Spark contains Microsoft's open source " + | + "contributions to the Apache Spark ecosystem", + | license="MIT", + | packages=find_namespace_packages(include=['mmlspark.*']), + | url="https://github.com/Azure/mmlspark", + | author="Microsoft", + | author_email="mmlspark-support@microsoft.com", + | classifiers=[ + | "Development Status :: 4 - Beta", + | "Intended Audience :: Developers", + | "Intended Audience :: Data Scientists", + | "Topic :: Software Development :: Datascience Tools", + | "License :: OSI Approved :: MIT License", + | "Programming Language :: Python :: 2", + | "Programming Language :: Python :: 3", + | ], + | zip_safe=True, + | package_data={"mmlspark": ["../LICENSE.txt", "../README.txt"]}, + |) + | + |""".stripMargin) + } + + + def rGen(conf: CodegenConfig): Unit = { + println(s"Generating R for ${conf.jarName}") + clean(conf.rSrcRoot) + generateRPackageData(conf) + generateRClasses(conf) + if (conf.rSrcOverrideDir.exists()) + FileUtils.copyDirectoryToDirectory(toDir(conf.rSrcOverrideDir), toDir(conf.rSrcDir)) + if (conf.rTestOverrideDir.exists()) + FileUtils.copyDirectoryToDirectory(toDir(conf.rTestOverrideDir), toDir(conf.rTestDir)) + } + + def pyGen(conf: CodegenConfig): Unit = { + println(s"Generating python for ${conf.jarName}") + clean(conf.pySrcDir) + generatePyPackageData(conf) + generatePythonClasses(conf) + if (conf.pySrcOverrideDir.exists()) + FileUtils.copyDirectoryToDirectory(toDir(conf.pySrcOverrideDir), toDir(conf.pySrcDir)) + makeInitFiles(conf) + } + + def main(args: Array[String]): Unit = { + val conf = args.head.parseJson.convertTo[CodegenConfig] + clean(conf.packageDir) + rGen(conf) + pyGen(conf) + } + +} + diff --git a/core/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala new file mode 100644 index 0000000000..049eb1bb8f --- /dev/null +++ b/core/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala @@ -0,0 +1,80 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
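For orientation, CodeGen.main above drives both generators from a single JSON-serialized CodegenConfig (the case class defined in the next file). A minimal sketch of invoking it by hand, with every name, path, and version below hypothetical:

```scala
import com.microsoft.ml.spark.codegen.{CodeGen, CodegenConfig}
import com.microsoft.ml.spark.codegen.CodegenConfigProtocol._
import spray.json._

// All values are hypothetical; real builds would supply them from sbt settings.
val conf = CodegenConfig(
  name = "mmlspark-core",
  jarName = Some("mmlspark-core"),  // limits reflection-based class discovery to one jar
  topDir = "core",                  // module root, used to locate src/main/python overrides
  targetDir = "core/target",        // all output lands under <targetDir>/generated
  version = "1.0.0-rc3",
  pythonizedVersion = "1.0.0rc3",
  rVersion = "1.0.0.3",
  packageName = "mmlspark")

// main cleans <targetDir>/generated/package, then runs rGen and pyGen.
CodeGen.main(Array(conf.toJson.compactPrint))
```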
+ +package com.microsoft.ml.spark.codegen + +import java.io.File + +import spray.json.{DefaultJsonProtocol, RootJsonFormat} + + +case class CodegenConfig(name: String, + jarName: Option[String], + topDir: String, + targetDir: String, + version: String, + pythonizedVersion: String, + rVersion: String, + packageName: String) { + def generatedDir: File = new File(targetDir, "generated") + def packageDir: File = new File(generatedDir, "package") + def srcDir: File = new File(generatedDir, "src") + def testDir: File = new File(generatedDir, "test") + def docDir: File = new File(generatedDir, "doc") + def testDataDir: File = new File(generatedDir, "test-data") + + //Python Codegen Constant + def pySrcDir: File = new File(srcDir, "python") + def pyPackageDir: File = new File(packageDir, "python") + def pyTestDir: File = new File(testDir, "python") + def pySrcOverrideDir: File = new File(topDir, "src/main/python") + def pyTestOverrideDir: File = new File(topDir, "src/test/python") + + //R Codegen Constants + def rSrcRoot: File = new File(srcDir, "R") + def rSrcDir: File = new File(rSrcRoot, "mmlspark/R") + def rPackageDir: File = new File(packageDir, "R") + def rTestDir: File = new File(rSrcRoot, "mmlspark/tests") + def rTestOverrideDir: File = new File(topDir, "src/test/R") + def rSrcOverrideDir: File = new File(topDir, "src/main/R") + + //val rPackageFile = new File(rPackageDir, s"mmlspark-$mmlVer.zip") + def internalPrefix: String = "_" + def scopeDepth: String = " " * 4 + + def copyrightLines: String = + s"""|# Copyright (C) Microsoft Corporation. All rights reserved. + |# Licensed under the MIT License. See LICENSE in project root for information. + |""".stripMargin + + // The __init__.py file + def packageHelp(importString: String): String = { + s"""|$copyrightLines + | + |"\"" + |MMLSpark is an ecosystem of tools aimed towards expanding the distributed computing framework + |Apache Spark in several new directions. MMLSpark adds many deep learning and data science tools to the Spark + |ecosystem, including seamless integration of Spark Machine Learning pipelines with + |Microsoft Cognitive Toolkit (CNTK), LightGBM and OpenCV. These tools enable powerful and + |highly-scalable predictive and analytical models for a variety of datasources. + | + |MMLSpark also brings new networking capabilities to the Spark Ecosystem. With the HTTP on Spark project, + |users can embed any web service into their SparkML models. In this vein, MMLSpark provides easy to use SparkML + |transformers for a wide variety of Microsoft Cognitive Services. For production grade deployment, + |the Spark Serving project enables high throughput, sub-millisecond latency web services, + |backed by your Spark cluster. + | + |MMLSpark requires Scala 2.11, Spark 2.4+, and Python 3.5+. 
+ |"\"" + | + |__version__ = "$pythonizedVersion" + |__spark_package_version__ = "$version" + | + |$importString + |""".stripMargin + } +} + +object CodegenConfigProtocol extends DefaultJsonProtocol { + implicit val CCFormat: RootJsonFormat[CodegenConfig] = jsonFormat8(CodegenConfig.apply) +} diff --git a/src/main/scala/com/microsoft/ml/spark/codegen/GenerationUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/GenerationUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/codegen/GenerationUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/codegen/GenerationUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala b/core/src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala similarity index 98% rename from src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala rename to core/src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala index 1ee75a98c7..5629e6ef66 100644 --- a/src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala @@ -375,11 +375,11 @@ trait PythonWrappable extends BaseWrappable { """.stripMargin } - def makePyFile(): Unit = { + def makePyFile(conf: CodegenConfig): Unit = { val importPath = this.getClass.getName.split(".".toCharArray).dropRight(1) val srcFolders = importPath.mkString(".") .replaceAllLiterally("com.microsoft.ml.spark", "mmlspark").split(".".toCharArray) - val srcDir = FileUtilities.join((Seq(Config.PySrcDir.toString) ++ srcFolders.toSeq): _*) + val srcDir = FileUtilities.join((Seq(conf.pySrcDir.toString) ++ srcFolders.toSeq): _*) srcDir.mkdirs() Files.write( FileUtilities.join(srcDir, pyClassName + ".py").toPath, @@ -500,10 +500,10 @@ trait RWrappable extends BaseWrappable { } - def makeRFile(): Unit = { - Config.RSrcDir.mkdirs() + def makeRFile(conf: CodegenConfig): Unit = { + conf.rSrcDir.mkdirs() Files.write( - FileUtilities.join(Config.RSrcDir, rFuncName + ".R").toPath, + FileUtilities.join(conf.rSrcDir, rFuncName + ".R").toPath, rClass().getBytes(StandardCharsets.UTF_8)) } diff --git a/src/main/scala/com/microsoft/ml/spark/core/contracts/Metrics.scala b/core/src/main/scala/com/microsoft/ml/spark/core/contracts/Metrics.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/contracts/Metrics.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/contracts/Metrics.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/contracts/Params.scala b/core/src/main/scala/com/microsoft/ml/spark/core/contracts/Params.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/contracts/Params.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/contracts/Params.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/env/FileUtilities.scala b/core/src/main/scala/com/microsoft/ml/spark/core/env/FileUtilities.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/env/FileUtilities.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/env/FileUtilities.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/env/NativeLoader.java b/core/src/main/scala/com/microsoft/ml/spark/core/env/NativeLoader.java similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/env/NativeLoader.java rename to core/src/main/scala/com/microsoft/ml/spark/core/env/NativeLoader.java diff --git a/src/main/scala/com/microsoft/ml/spark/core/env/StreamUtilities.scala 
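Every generator path derives from CodegenConfig.targetDir; a short sketch of the resulting layout, continuing the hypothetical conf from the sketch above:

```scala
// Derived layout for the hypothetical targetDir "core/target":
conf.generatedDir  // core/target/generated
conf.srcDir        // core/target/generated/src
conf.pySrcDir      // core/target/generated/src/python          (Python classes emitted here)
conf.rSrcRoot      // core/target/generated/src/R
conf.rSrcDir       // core/target/generated/src/R/mmlspark/R    (R functions emitted here)
conf.packageDir    // core/target/generated/package             (cleaned on every run)
```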
b/core/src/main/scala/com/microsoft/ml/spark/core/env/StreamUtilities.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/env/StreamUtilities.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/env/StreamUtilities.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricConstants.scala b/core/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricConstants.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/metrics/MetricConstants.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricConstants.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/metrics/MetricUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/metrics/MetricUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/BinaryFileSchema.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/BinaryFileSchema.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/BinaryFileSchema.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/BinaryFileSchema.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/Categoricals.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/Categoricals.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/Categoricals.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/Categoricals.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/DatasetExtensions.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/DatasetExtensions.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/DatasetExtensions.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/DatasetExtensions.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/ImageSchemaUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/ImageSchemaUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/ImageSchemaUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/ImageSchemaUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/SchemaConstants.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/SchemaConstants.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/SchemaConstants.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/SchemaConstants.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/SparkBindings.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/SparkBindings.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/SparkBindings.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/SparkBindings.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/schema/SparkSchema.scala b/core/src/main/scala/com/microsoft/ml/spark/core/schema/SparkSchema.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/schema/SparkSchema.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/schema/SparkSchema.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/serialize/ComplexParam.scala 
b/core/src/main/scala/com/microsoft/ml/spark/core/serialize/ComplexParam.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/serialize/ComplexParam.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/serialize/ComplexParam.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/spark/FluentAPI.scala b/core/src/main/scala/com/microsoft/ml/spark/core/spark/FluentAPI.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/spark/FluentAPI.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/spark/FluentAPI.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/AsyncUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/AsyncUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/AsyncUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/AsyncUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/CastUtilities.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/CastUtilities.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/CastUtilities.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/CastUtilities.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala index 297dba1de6..db8e39cd03 100644 --- a/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala @@ -6,7 +6,7 @@ package com.microsoft.ml.spark.core.utils import java.net.InetAddress import org.apache.http.conn.util.InetAddressUtils -import org.apache.spark.lightgbm.BlockManagerUtils +import org.apache.spark.injections.BlockManagerUtils import org.apache.spark.sql.{Dataset, SparkSession} import org.slf4j.Logger diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/ContextObjectInputStream.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ContextObjectInputStream.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/ContextObjectInputStream.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/ContextObjectInputStream.scala diff --git a/core/src/main/scala/com/microsoft/ml/spark/core/utils/FaultToleranceUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/FaultToleranceUtils.scala new file mode 100644 index 0000000000..7bbe1b1d0c --- /dev/null +++ b/core/src/main/scala/com/microsoft/ml/spark/core/utils/FaultToleranceUtils.scala @@ -0,0 +1,33 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark.core.utils + +import scala.concurrent.duration.Duration +import scala.concurrent.{Await, ExecutionContext, Future} + +object FaultToleranceUtils { + def retryWithTimeout[T](times: Int, timeout: Duration)(f: => T): T ={ + try { + Await.result(Future(f)(ExecutionContext.global), timeout) + } catch { + case e: Exception if times >= 1 => + print(s"Received exception on call, retrying: $e") + retryWithTimeout(times-1, timeout)(f) + } + } + + val Backoffs: Seq[Int] = Seq(0, 100, 200, 500) + + def retryWithTimeout[T](times: Seq[Int] = Backoffs)(f: => T): T ={ + try { + f + } catch { + case e: Exception if times.nonEmpty => + println(s"Received exception on call, retrying: $e") + Thread.sleep(times.head) + retryWithTimeout(times.tail)(f) + } + } + +} diff --git a/src/test/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala similarity index 72% rename from src/test/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala index dba98c4a59..478631f620 100644 --- a/src/test/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/core/utils/JarLoadingUtils.scala @@ -7,9 +7,8 @@ import java.lang.reflect.Modifier import com.microsoft.ml.spark.codegen.Wrappable import org.sparkproject.guava.reflect.ClassPath - import scala.collection.JavaConverters._ -import scala.reflect.{ClassTag, _} +import scala.reflect.{ClassTag, classTag} /** Contains logic for loading classes. */ object JarLoadingUtils { @@ -41,22 +40,25 @@ object JarLoadingUtils { AllClasses.filter(classOf[Wrappable].isAssignableFrom(_)) } - def instantiateServices[T: ClassTag](instantiate: Class[_] => Any): List[T] = { + def instantiateServices[T: ClassTag](instantiate: Class[_] => Any, jarName: Option[String]): List[T] = { AllClasses .filter(classTag[T].runtimeClass.isAssignableFrom(_)) + .filter(c => jarName.forall(c.getResource(c.getSimpleName + ".class").toString.contains(_))) .filter(clazz => !Modifier.isAbstract(clazz.getModifiers)) .map(instantiate(_)).asInstanceOf[List[T]] } - def instantiateServices[T: ClassTag]: List[T] = instantiateServices[T] { + def instantiateServices[T: ClassTag](jarName: Option[String] = None): List[T] = instantiateServices[T]({ clazz: Class[_] => clazz.getConstructor().newInstance() - } + }, jarName) - def instantiateObjects[T: ClassTag]: List[T] = instantiateServices[T] { clazz: Class[_] => { - val cons = clazz.getDeclaredConstructors()(0) - cons.setAccessible(true) - cons.newInstance() - }} + def instantiateObjects[T: ClassTag](jarName: Option[String] = None): List[T] = instantiateServices[T]( + { clazz: Class[_] => { + val cons = clazz.getDeclaredConstructors()(0) + cons.setAccessible(true) + cons.newInstance() + } + }, + jarName) } - diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/ModelEquality.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ModelEquality.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/ModelEquality.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/ModelEquality.scala diff --git a/core/src/main/scala/com/microsoft/ml/spark/core/utils/OsUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/OsUtils.scala new file mode 100644 index 0000000000..80c4560fe8 --- /dev/null +++ 
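FaultToleranceUtils above offers two retry helpers: one bounds each attempt with Await.result and retries a fixed number of times, the other walks a backoff schedule and sleeps between attempts. A minimal usage sketch (flakyCall is a stand-in):

```scala
import scala.concurrent.duration._
import com.microsoft.ml.spark.core.utils.FaultToleranceUtils

def flakyCall(): Int = scala.util.Random.nextInt(100)  // stand-in for an unreliable operation

// Bound each attempt to 10 seconds; retry up to 3 more times on failure or timeout.
val v1 = FaultToleranceUtils.retryWithTimeout(3, 10.seconds) { flakyCall() }

// Retry with the fixed backoff schedule, sleeping 0, 100, 200, then 500 ms between attempts.
val v2 = FaultToleranceUtils.retryWithTimeout(Seq(0, 100, 200, 500)) { flakyCall() }
```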
b/core/src/main/scala/com/microsoft/ml/spark/core/utils/OsUtils.scala @@ -0,0 +1,8 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.core.utils + +object OsUtils { + val IsWindows: Boolean = System.getProperty("os.name").toLowerCase().indexOf("win") >= 0 +} diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/RowUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/RowUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/RowUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/RowUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/core/utils/StopWatch.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/StopWatch.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/core/utils/StopWatch.scala rename to core/src/main/scala/com/microsoft/ml/spark/core/utils/StopWatch.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/CleanMissingData.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/CountSelector.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/CountSelector.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/CountSelector.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/CountSelector.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/DataConversion.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/Featurize.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/Featurize.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/Featurize.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.scala 
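OsUtils, added at the top of this hunk, exposes a single platform flag; a trivial sketch of the kind of branch it supports (the suffixes are illustrative):

```scala
import com.microsoft.ml.spark.core.utils.OsUtils

// Hypothetical use: pick a native library suffix by platform.
val suffix = if (OsUtils.IsWindows) ".dll" else ".so"
```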
b/core/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/IndexToValue.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexer.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexerModel.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexerModel.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexerModel.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/ValueIndexerModel.txt diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/text/MultiNGram.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/text/MultiNGram.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/text/MultiNGram.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/text/MultiNGram.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/text/PageSplitter.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/text/PageSplitter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/text/PageSplitter.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/text/PageSplitter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.scala b/core/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.scala rename to core/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.txt b/core/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.txt rename to core/src/main/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizer.txt diff --git a/src/main/scala/com/microsoft/ml/spark/image/ResizeImageTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/image/ResizeImageTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/image/ResizeImageTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/image/ResizeImageTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.scala b/core/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.scala similarity index 
100% rename from src/main/scala/com/microsoft/ml/spark/image/UnrollImage.scala rename to core/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.scala diff --git a/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.txt b/core/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/image/UnrollImage.txt rename to core/src/main/scala/com/microsoft/ml/spark/image/UnrollImage.txt diff --git a/src/main/scala/com/microsoft/ml/spark/io/IOImplicits.scala b/core/src/main/scala/com/microsoft/ml/spark/io/IOImplicits.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/IOImplicits.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/IOImplicits.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/binary/Binary.scala b/core/src/main/scala/com/microsoft/ml/spark/io/binary/Binary.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/binary/Binary.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/binary/Binary.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileFormat.scala b/core/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileFormat.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileFormat.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileFormat.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileReader.scala b/core/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileReader.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileReader.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/binary/BinaryFileReader.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/binary/KeyValueReaderIterator.scala b/core/src/main/scala/com/microsoft/ml/spark/io/binary/KeyValueReaderIterator.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/binary/KeyValueReaderIterator.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/binary/KeyValueReaderIterator.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/Clients.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/Clients.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/Clients.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/Clients.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/HTTPClients.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPClients.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/HTTPClients.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPClients.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/HTTPSchema.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPSchema.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/HTTPSchema.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPSchema.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/HTTPTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/HTTPTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/HTTPTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/Parsers.scala 
b/core/src/main/scala/com/microsoft/ml/spark/io/http/Parsers.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/Parsers.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/Parsers.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/PortForwarding.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/PortForwarding.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/PortForwarding.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/PortForwarding.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/SharedVariable.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/SharedVariable.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/SharedVariable.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/SharedVariable.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/http/SimpleHTTPTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/io/http/SimpleHTTPTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/http/SimpleHTTPTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/http/SimpleHTTPTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/image/ImageUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/io/image/ImageUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/image/ImageUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/image/ImageUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/io/powerbi/PowerBIWriter.scala b/core/src/main/scala/com/microsoft/ml/spark/io/powerbi/PowerBIWriter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/io/powerbi/PowerBIWriter.scala rename to core/src/main/scala/com/microsoft/ml/spark/io/powerbi/PowerBIWriter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/isolationforest/IsolationForest.scala b/core/src/main/scala/com/microsoft/ml/spark/isolationforest/IsolationForest.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/isolationforest/IsolationForest.scala rename to core/src/main/scala/com/microsoft/ml/spark/isolationforest/IsolationForest.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/BreezeUtils.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/BreezeUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/BreezeUtils.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/BreezeUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/LIME.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/LIME.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/LIME.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/LIME.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/Superpixel.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/SuperpixelTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/SuperpixelTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/SuperpixelTransformer.scala rename to 
core/src/main/scala/com/microsoft/ml/spark/lime/SuperpixelTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lime/TextLIME.scala b/core/src/main/scala/com/microsoft/ml/spark/lime/TextLIME.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lime/TextLIME.scala rename to core/src/main/scala/com/microsoft/ml/spark/lime/TextLIME.scala diff --git a/src/main/scala/com/microsoft/ml/spark/logging/BasicLogging.scala b/core/src/main/scala/com/microsoft/ml/spark/logging/BasicLogging.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/logging/BasicLogging.scala rename to core/src/main/scala/com/microsoft/ml/spark/logging/BasicLogging.scala diff --git a/src/main/scala/com/microsoft/ml/spark/nn/BallTree.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/BallTree.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/nn/BallTree.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/BallTree.scala diff --git a/src/main/scala/com/microsoft/ml/spark/nn/BoundedPriorityQueue.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/BoundedPriorityQueue.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/nn/BoundedPriorityQueue.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/BoundedPriorityQueue.scala diff --git a/src/main/scala/com/microsoft/ml/spark/nn/ConditionalKNN.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/ConditionalKNN.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/nn/ConditionalKNN.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/ConditionalKNN.scala diff --git a/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala similarity index 98% rename from src/main/scala/com/microsoft/ml/spark/nn/KNN.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala index a4c3973a79..2acde7942b 100644 --- a/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/nn/KNN.scala @@ -84,7 +84,7 @@ class KNNModel(val uid: String) extends Model[KNNModel] private var broadcastedModelOption: Option[Broadcast[BallTree[_]]] = None val ballTree = new BallTreeParam(this, "ballTree", - "the ballTree model used for perfoming queries", { _ => true }) + "the ballTree model used for performing queries", { _ => true }) def getBallTree: BallTree[_] = $(ballTree) diff --git a/src/main/scala/com/microsoft/ml/spark/nn/Schemas.scala b/core/src/main/scala/com/microsoft/ml/spark/nn/Schemas.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/nn/Schemas.scala rename to core/src/main/scala/com/microsoft/ml/spark/nn/Schemas.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/RankingAdapter.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingAdapter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/RankingAdapter.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingAdapter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/RankingEvaluator.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingEvaluator.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/RankingEvaluator.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingEvaluator.scala diff --git 
a/src/main/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSplit.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSplit.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSplit.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSplit.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexer.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexer.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/SAR.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/SAR.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/SAR.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/SAR.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/SARModel.scala b/core/src/main/scala/com/microsoft/ml/spark/recommendation/SARModel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/SARModel.scala rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/SARModel.scala diff --git a/src/main/scala/com/microsoft/ml/spark/recommendation/recommendation.txt b/core/src/main/scala/com/microsoft/ml/spark/recommendation/recommendation.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/recommendation/recommendation.txt rename to core/src/main/scala/com/microsoft/ml/spark/recommendation/recommendation.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Batchers.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Batchers.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Batchers.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Batchers.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Cacher.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Cacher.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Cacher.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Cacher.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/ClassBalancer.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/ClassBalancer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/ClassBalancer.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/ClassBalancer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/DropColumns.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/DropColumns.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/DropColumns.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/DropColumns.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.txt 
similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.txt rename to core/src/main/scala/com/microsoft/ml/spark/stages/EnsembleByKey.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Explode.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Explode.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Explode.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Explode.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Lambda.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Lambda.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Lambda.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Lambda.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/MiniBatchTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/MiniBatchTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/MiniBatchTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/MiniBatchTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.txt rename to core/src/main/scala/com/microsoft/ml/spark/stages/MultiColumnAdapter.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala index 0e05283c7b..8a3cc7a0fa 100644 --- a/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala @@ -19,6 +19,35 @@ import scala.concurrent.blocking object PartitionConsolidator extends DefaultParamsReadable[PartitionConsolidator] +class PartitionConsolidator(val uid: String) + extends Transformer with HTTPParams with HasInputCol + with HasOutputCol + with ComplexParamsWritable with BasicLogging { + logClass() + + def this() = this(Identifiable.randomUID("PartitionConsolidator")) + + val consolidatorHolder = SharedSingleton { + new Consolidator[Row]() + } + + override def transform(dataset: Dataset[_]): DataFrame = { + logTransform[DataFrame]({ + dataset.toDF().mapPartitions { it => + if (it.hasNext) { + consolidatorHolder.get.registerAndReceive(it).flatten + } else { + Iterator() + } + }(RowEncoder(dataset.schema)) + }) + } + + override def copy(extra: ParamMap): Transformer = defaultCopy(extra) + + override def transformSchema(schema: StructType): StructType = schema +} + class Consolidator[T] { val buffer = new LinkedBlockingQueue[T]() @@ -108,36 +137,8 @@ class Consolidator[T] { } -class PartitionConsolidator(val uid: String) - extends Transformer with HTTPParams with HasInputCol - with HasOutputCol - with ComplexParamsWritable with 
BasicLogging { - logClass() - - def this() = this(Identifiable.randomUID("PartitionConsolidator")) - - val consolidatorHolder = SharedSingleton { - new Consolidator[Row]() - } - - override def transform(dataset: Dataset[_]): DataFrame = { - logTransform[DataFrame]({ - dataset.toDF().mapPartitions { it => - if (it.hasNext) { - consolidatorHolder.get.registerAndReceive(it).flatten - } else { - Iterator() - } - }(RowEncoder(dataset.schema)) - }) - } - - override def copy(extra: ParamMap): Transformer = defaultCopy(extra) - - override def transformSchema(schema: StructType): StructType = schema -} - trait LocalAggregator[T] { def prep(iter: Iterator[Row]): T + def merge(ts: Seq[T]): T } diff --git a/src/main/scala/com/microsoft/ml/spark/stages/RenameColumn.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/RenameColumn.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/RenameColumn.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/RenameColumn.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Repartition.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Repartition.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Repartition.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Repartition.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Repartition.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/Repartition.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Repartition.txt rename to core/src/main/scala/com/microsoft/ml/spark/stages/Repartition.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.txt rename to core/src/main/scala/com/microsoft/ml/spark/stages/SelectColumns.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/StratifiedRepartition.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/StratifiedRepartition.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/StratifiedRepartition.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/StratifiedRepartition.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.txt b/core/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.txt rename to core/src/main/scala/com/microsoft/ml/spark/stages/SummarizeData.txt diff --git a/src/main/scala/com/microsoft/ml/spark/stages/TextPreprocessor.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/TextPreprocessor.scala similarity index 100% rename from 
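PartitionConsolidator, which this change moves above its Consolidator helper, is an ordinary Spark Transformer: each partition's iterator registers with a JVM-wide shared Consolidator, so an executor's rows drain through whichever partitions receive them. A minimal sketch of applying it, assuming df is an existing DataFrame:

```scala
import com.microsoft.ml.spark.stages.PartitionConsolidator
import org.apache.spark.sql.DataFrame

// Funnel rows from all of an executor's partitions through the shared
// Consolidator; empty partitions yield empty iterators downstream.
def consolidate(df: DataFrame): DataFrame =
  new PartitionConsolidator().transform(df)
```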
src/main/scala/com/microsoft/ml/spark/stages/TextPreprocessor.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/TextPreprocessor.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/Timer.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/Timer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/Timer.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/Timer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/UDFTransformer.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/UDFTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/UDFTransformer.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/UDFTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala similarity index 97% rename from src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala index be12d2dcee..889d1d8522 100644 --- a/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/stages/UnicodeNormalize.scala @@ -1,79 +1,79 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.stages - -import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} -import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap} -import org.apache.spark.ml.util.Identifiable -import org.apache.spark.sql.{DataFrame, Dataset} -import org.apache.spark.sql.functions.udf - -import java.text.Normalizer -import com.microsoft.ml.spark.codegen.Wrappable -import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol} -import com.microsoft.ml.spark.logging.BasicLogging -import org.apache.spark.sql.types.{StringType, StructField, StructType} - -object UnicodeNormalize extends ComplexParamsReadable[UnicodeNormalize] - -/** UnicodeNormalize takes a dataframe and normalizes the unicode representation. 
- */ -class UnicodeNormalize(val uid: String) extends Transformer - with HasInputCol with HasOutputCol with Wrappable with ComplexParamsWritable with BasicLogging { - logClass() - - def this() = this(Identifiable.randomUID("UnicodeNormalize")) - - val form = new Param[String](this, "form", "Unicode normalization form: NFC, NFD, NFKC, NFKD") - - /** @group getParam */ - def getForm: String = get(form).getOrElse("NFKD") - - /** @group setParam */ - def setForm(value: String): this.type = { - // check input value - Normalizer.Form.valueOf(getForm) - - set("form", value) - } - - val lower = new BooleanParam(this, "lower", "Lowercase text") - - /** @group getParam */ - def getLower: Boolean = get(lower).getOrElse(true) - - /** @group setParam */ - def setLower(value: Boolean): this.type = set("lower", value) - - /** @param dataset - The input dataset, to be transformed - * @return The DataFrame that results from column selection - */ - override def transform(dataset: Dataset[_]): DataFrame = { - logTransform[DataFrame]({ - val inputIndex = dataset.columns.indexOf(getInputCol) - - require(inputIndex != -1, s"Input column $getInputCol does not exist") - - val normalizeFunc = (value: String) => - if (value == null) null - else Normalizer.normalize(value, Normalizer.Form.valueOf(getForm)) - - val f = if (getLower) - (value: String) => Option(value).map(s => normalizeFunc(s.toLowerCase)).orNull - else - normalizeFunc - - val textMapper = udf(f) - - dataset.withColumn(getOutputCol, textMapper(dataset(getInputCol)).as(getOutputCol)) - }) - } - - def transformSchema(schema: StructType): StructType = { - schema.add(StructField(getOutputCol, StringType)) - } - - def copy(extra: ParamMap): UnicodeNormalize = defaultCopy(extra) - -} +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.stages + +import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} +import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap} +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.functions.udf + +import java.text.Normalizer +import com.microsoft.ml.spark.codegen.Wrappable +import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol} +import com.microsoft.ml.spark.logging.BasicLogging +import org.apache.spark.sql.types.{StringType, StructField, StructType} + +object UnicodeNormalize extends ComplexParamsReadable[UnicodeNormalize] + +/** UnicodeNormalize takes a dataframe and normalizes the unicode representation. 
+ */ +class UnicodeNormalize(val uid: String) extends Transformer + with HasInputCol with HasOutputCol with Wrappable with ComplexParamsWritable with BasicLogging { + logClass() + + def this() = this(Identifiable.randomUID("UnicodeNormalize")) + + val form = new Param[String](this, "form", "Unicode normalization form: NFC, NFD, NFKC, NFKD") + + /** @group getParam */ + def getForm: String = get(form).getOrElse("NFKD") + + /** @group setParam */ + def setForm(value: String): this.type = { + // check input value + Normalizer.Form.valueOf(getForm) + + set("form", value) + } + + val lower = new BooleanParam(this, "lower", "Lowercase text") + + /** @group getParam */ + def getLower: Boolean = get(lower).getOrElse(true) + + /** @group setParam */ + def setLower(value: Boolean): this.type = set("lower", value) + + /** @param dataset - The input dataset, to be transformed + * @return The DataFrame that results from column selection + */ + override def transform(dataset: Dataset[_]): DataFrame = { + logTransform[DataFrame]({ + val inputIndex = dataset.columns.indexOf(getInputCol) + + require(inputIndex != -1, s"Input column $getInputCol does not exist") + + val normalizeFunc = (value: String) => + if (value == null) null + else Normalizer.normalize(value, Normalizer.Form.valueOf(getForm)) + + val f = if (getLower) + (value: String) => Option(value).map(s => normalizeFunc(s.toLowerCase)).orNull + else + normalizeFunc + + val textMapper = udf(f) + + dataset.withColumn(getOutputCol, textMapper(dataset(getInputCol)).as(getOutputCol)) + }) + } + + def transformSchema(schema: StructType): StructType = { + schema.add(StructField(getOutputCol, StringType)) + } + + def copy(extra: ParamMap): UnicodeNormalize = defaultCopy(extra) + +} diff --git a/src/main/scala/com/microsoft/ml/spark/stages/udfs.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/udfs.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/stages/udfs.scala rename to core/src/main/scala/com/microsoft/ml/spark/stages/udfs.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala b/core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainedModel.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala b/core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/AutoTrainer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala b/core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt b/core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputeModelStatistics.txt diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala 
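UnicodeNormalize above is likewise a plain Transformer; a minimal usage sketch, assuming the usual setInputCol/setOutputCol setters from the HasInputCol/HasOutputCol contracts and hypothetical column names (form defaults to NFKD, lower to true):

```scala
import com.microsoft.ml.spark.stages.UnicodeNormalize
import org.apache.spark.sql.DataFrame

def normalizeText(df: DataFrame): DataFrame =
  new UnicodeNormalize()
    .setForm("NFC")            // one of NFC, NFD, NFKC, NFKD
    .setLower(false)           // keep original casing
    .setInputCol("text")       // hypothetical input column
    .setOutputCol("textNorm")  // normalized output column
    .transform(df)
```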
b/core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt b/core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt rename to core/src/main/scala/com/microsoft/ml/spark/train/ComputePerInstanceStatistics.txt diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala b/core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt b/core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainClassifier.txt diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala b/core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.scala diff --git a/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt b/core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt rename to core/src/main/scala/com/microsoft/ml/spark/train/TrainRegressor.txt diff --git a/src/main/scala/org/apache/spark/lightgbm/BlockManagerUtils.scala b/core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala similarity index 92% rename from src/main/scala/org/apache/spark/lightgbm/BlockManagerUtils.scala rename to core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala index ee0ba74dd4..6d0564abb4 100644 --- a/src/main/scala/org/apache/spark/lightgbm/BlockManagerUtils.scala +++ b/core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala @@ -1,13 +1,14 @@ // Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. -package org.apache.spark.lightgbm +package org.apache.spark.injections import org.apache.spark.sql.Dataset import org.apache.spark.storage.BlockManager object BlockManagerUtils { /** Returns the block manager from the dataframe's spark context. + * * @param data The dataframe to get the block manager from. * @return The block manager. 
*/ diff --git a/src/main/scala/org/apache/spark/injections/RegressionUtils.scala b/core/src/main/scala/org/apache/spark/injections/RegressionUtils.scala similarity index 100% rename from src/main/scala/org/apache/spark/injections/RegressionUtils.scala rename to core/src/main/scala/org/apache/spark/injections/RegressionUtils.scala diff --git a/src/main/scala/org/apache/spark/injections/SConf.scala b/core/src/main/scala/org/apache/spark/injections/SConf.scala similarity index 100% rename from src/main/scala/org/apache/spark/injections/SConf.scala rename to core/src/main/scala/org/apache/spark/injections/SConf.scala diff --git a/src/main/scala/org/apache/spark/injections/UDFUtils.scala b/core/src/main/scala/org/apache/spark/injections/UDFUtils.scala similarity index 100% rename from src/main/scala/org/apache/spark/injections/UDFUtils.scala rename to core/src/main/scala/org/apache/spark/injections/UDFUtils.scala diff --git a/src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala b/core/src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala rename to core/src/main/scala/org/apache/spark/ml/ComplexParamsSerializer.scala diff --git a/src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala b/core/src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala rename to core/src/main/scala/org/apache/spark/ml/LimeNamespaceInjections.scala diff --git a/src/main/scala/org/apache/spark/ml/NamespaceInjections.scala b/core/src/main/scala/org/apache/spark/ml/NamespaceInjections.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/NamespaceInjections.scala rename to core/src/main/scala/org/apache/spark/ml/NamespaceInjections.scala diff --git a/src/main/scala/org/apache/spark/ml/Ranker.scala b/core/src/main/scala/org/apache/spark/ml/Ranker.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/Ranker.scala rename to core/src/main/scala/org/apache/spark/ml/Ranker.scala diff --git a/src/main/scala/org/apache/spark/ml/RegressorUtils.scala b/core/src/main/scala/org/apache/spark/ml/RegressorUtils.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/RegressorUtils.scala rename to core/src/main/scala/org/apache/spark/ml/RegressorUtils.scala diff --git a/src/main/scala/org/apache/spark/ml/Serializer.scala b/core/src/main/scala/org/apache/spark/ml/Serializer.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/Serializer.scala rename to core/src/main/scala/org/apache/spark/ml/Serializer.scala diff --git a/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala b/core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala rename to core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.scala diff --git a/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt b/core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt similarity index 100% rename from src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt rename to core/src/main/scala/org/apache/spark/ml/feature/FastVectorAssembler.txt diff --git a/src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala 
similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/ArrayMapParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/ArrayParamMapParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala b/core/src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/BallTreeParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/ByteArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala b/core/src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/DataFrameParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala b/core/src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/DataTypeParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/EstimatorArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala b/core/src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/EstimatorParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala b/core/src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/EvaluatorParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala b/core/src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/JsonEncodableParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/MapArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/MapParam.scala b/core/src/main/scala/org/apache/spark/ml/param/MapParam.scala similarity index 100% rename from 
src/main/scala/org/apache/spark/ml/param/MapParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/MapParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala b/core/src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/ParamSpaceParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala b/core/src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/PipelineStageParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala b/core/src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/PythonWrappableParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala b/core/src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/RWrappableParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/TransformerArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/TransformerParam.scala b/core/src/main/scala/org/apache/spark/ml/param/TransformerParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/TransformerParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/TransformerParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/TypedArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/UDFParam.scala b/core/src/main/scala/org/apache/spark/ml/param/UDFParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/UDFParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/UDFParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala b/core/src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/UDPyFParam.scala diff --git a/src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala b/core/src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala rename to core/src/main/scala/org/apache/spark/ml/param/UntypedArrayParam.scala diff --git a/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala b/core/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala similarity index 100% 
rename from src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala rename to core/src/main/scala/org/apache/spark/ml/recommendation/RecommendationHelper.scala diff --git a/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala b/core/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala similarity index 100% rename from src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala rename to core/src/main/scala/org/apache/spark/ml/source/image/PatchedImageFileFormat.scala diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/DistributedHTTPSource.scala diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/HTTPSource.scala diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/ServingUDFs.scala diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSinkV2.scala diff --git a/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala b/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala rename to core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/HTTPSourceV2.scala diff --git a/src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala b/core/src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala rename to core/src/main/scala/org/apache/spark/sql/types/injections/MetadataUtilities.scala diff --git a/src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala b/core/src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala similarity index 100% rename from src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala rename to core/src/main/scala/org/apache/spark/sql/types/injections/OptimizedCKNNFitting.scala diff --git a/src/test/R/testthat.R b/core/src/test/R/testthat.R similarity index 100% rename from src/test/R/testthat.R rename to core/src/test/R/testthat.R diff --git a/src/test/R/testthat/setup-spark.R b/core/src/test/R/testthat/setup-spark.R similarity index 100% rename from 
src/test/R/testthat/setup-spark.R rename to core/src/test/R/testthat/setup-spark.R diff --git a/src/test/R/testthat/test-basic.R b/core/src/test/R/testthat/test-basic.R similarity index 100% rename from src/test/R/testthat/test-basic.R rename to core/src/test/R/testthat/test-basic.R diff --git a/src/test/python/LICENSE.txt b/core/src/test/python/LICENSE.txt similarity index 100% rename from src/test/python/LICENSE.txt rename to core/src/test/python/LICENSE.txt diff --git a/src/test/python/MANIFEST.in b/core/src/test/python/MANIFEST.in similarity index 100% rename from src/test/python/MANIFEST.in rename to core/src/test/python/MANIFEST.in diff --git a/src/main/python/mmlspark/lightgbm/__init__.py b/core/src/test/python/__init__.py similarity index 100% rename from src/main/python/mmlspark/lightgbm/__init__.py rename to core/src/test/python/__init__.py diff --git a/src/main/python/mmlspark/nn/__init__.py b/core/src/test/python/mmlsparktest/__init__.py similarity index 100% rename from src/main/python/mmlspark/nn/__init__.py rename to core/src/test/python/mmlsparktest/__init__.py diff --git a/src/main/python/mmlspark/opencv/__init__.py b/core/src/test/python/mmlsparktest/cyber/__init__.py similarity index 100% rename from src/main/python/mmlspark/opencv/__init__.py rename to core/src/test/python/mmlsparktest/cyber/__init__.py diff --git a/src/main/python/mmlspark/plot/__init__.py b/core/src/test/python/mmlsparktest/cyber/anamoly/__init__.py similarity index 100% rename from src/main/python/mmlspark/plot/__init__.py rename to core/src/test/python/mmlsparktest/cyber/anamoly/__init__.py diff --git a/src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py b/core/src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py rename to core/src/test/python/mmlsparktest/cyber/anamoly/test_collaborative_filtering.py diff --git a/src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py b/core/src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py rename to core/src/test/python/mmlsparktest/cyber/anamoly/test_complement_access.py diff --git a/src/test/python/mmlsparktest/cyber/explain_tester.py b/core/src/test/python/mmlsparktest/cyber/explain_tester.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/explain_tester.py rename to core/src/test/python/mmlsparktest/cyber/explain_tester.py diff --git a/src/main/python/mmlspark/stages/__init__.py b/core/src/test/python/mmlsparktest/cyber/feature/__init__.py similarity index 100% rename from src/main/python/mmlspark/stages/__init__.py rename to core/src/test/python/mmlsparktest/cyber/feature/__init__.py diff --git a/src/test/python/mmlsparktest/cyber/feature/test_indexers.py b/core/src/test/python/mmlsparktest/cyber/feature/test_indexers.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/feature/test_indexers.py rename to core/src/test/python/mmlsparktest/cyber/feature/test_indexers.py diff --git a/src/test/python/mmlsparktest/cyber/feature/test_scalers.py b/core/src/test/python/mmlsparktest/cyber/feature/test_scalers.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/feature/test_scalers.py rename to core/src/test/python/mmlsparktest/cyber/feature/test_scalers.py diff --git a/src/main/python/mmlspark/vw/__init__.py 
b/core/src/test/python/mmlsparktest/cyber/utils/__init__.py similarity index 100% rename from src/main/python/mmlspark/vw/__init__.py rename to core/src/test/python/mmlsparktest/cyber/utils/__init__.py diff --git a/src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py b/core/src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py rename to core/src/test/python/mmlsparktest/cyber/utils/test_spark_utils.py diff --git a/src/test/__init__.py b/core/src/test/python/mmlsparktest/nn/__init__.py similarity index 100% rename from src/test/__init__.py rename to core/src/test/python/mmlsparktest/nn/__init__.py diff --git a/src/test/python/mmlsparktest/nn/test_ball_tree.py b/core/src/test/python/mmlsparktest/nn/test_ball_tree.py similarity index 100% rename from src/test/python/mmlsparktest/nn/test_ball_tree.py rename to core/src/test/python/mmlsparktest/nn/test_ball_tree.py diff --git a/src/test/python/__init__.py b/core/src/test/python/mmlsparktest/recommendation/__init__.py similarity index 100% rename from src/test/python/__init__.py rename to core/src/test/python/mmlsparktest/recommendation/__init__.py diff --git a/src/test/python/mmlsparktest/recommendation/test_ranking.py b/core/src/test/python/mmlsparktest/recommendation/test_ranking.py similarity index 100% rename from src/test/python/mmlsparktest/recommendation/test_ranking.py rename to core/src/test/python/mmlsparktest/recommendation/test_ranking.py diff --git a/src/test/python/mmlsparktest/spark.py b/core/src/test/python/mmlsparktest/spark.py similarity index 100% rename from src/test/python/mmlsparktest/spark.py rename to core/src/test/python/mmlsparktest/spark.py diff --git a/src/test/python/setup.py b/core/src/test/python/setup.py similarity index 100% rename from src/test/python/setup.py rename to core/src/test/python/setup.py diff --git a/src/test/resources/audio1.txt b/core/src/test/resources/audio1.txt similarity index 100% rename from src/test/resources/audio1.txt rename to core/src/test/resources/audio1.txt diff --git a/src/test/resources/audio1.wav b/core/src/test/resources/audio1.wav similarity index 100% rename from src/test/resources/audio1.wav rename to core/src/test/resources/audio1.wav diff --git a/src/test/resources/audio2.txt b/core/src/test/resources/audio2.txt similarity index 100% rename from src/test/resources/audio2.txt rename to core/src/test/resources/audio2.txt diff --git a/src/test/resources/audio2.wav b/core/src/test/resources/audio2.wav similarity index 100% rename from src/test/resources/audio2.wav rename to core/src/test/resources/audio2.wav diff --git a/src/test/resources/audio3.mp3 b/core/src/test/resources/audio3.mp3 similarity index 100% rename from src/test/resources/audio3.mp3 rename to core/src/test/resources/audio3.mp3 diff --git a/src/test/resources/audio3.txt b/core/src/test/resources/audio3.txt similarity index 100% rename from src/test/resources/audio3.txt rename to core/src/test/resources/audio3.txt diff --git a/src/test/resources/audio4.txt b/core/src/test/resources/audio4.txt similarity index 100% rename from src/test/resources/audio4.txt rename to core/src/test/resources/audio4.txt diff --git a/src/test/resources/benchmarks/benchmarkBasicDataTypes.json b/core/src/test/resources/benchmarks/benchmarkBasicDataTypes.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkBasicDataTypes.json rename to core/src/test/resources/benchmarks/benchmarkBasicDataTypes.json diff 
--git a/src/test/resources/benchmarks/benchmarkDate.json b/core/src/test/resources/benchmarks/benchmarkDate.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkDate.json rename to core/src/test/resources/benchmarks/benchmarkDate.json diff --git a/src/test/resources/benchmarks/benchmarkNoOneHot.json b/core/src/test/resources/benchmarks/benchmarkNoOneHot.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkNoOneHot.json rename to core/src/test/resources/benchmarks/benchmarkNoOneHot.json diff --git a/src/test/resources/benchmarks/benchmarkOneHot.json b/core/src/test/resources/benchmarks/benchmarkOneHot.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkOneHot.json rename to core/src/test/resources/benchmarks/benchmarkOneHot.json diff --git a/src/test/resources/benchmarks/benchmarkString.json b/core/src/test/resources/benchmarks/benchmarkString.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkString.json rename to core/src/test/resources/benchmarks/benchmarkString.json diff --git a/src/test/resources/benchmarks/benchmarkStringIndexOneHot.json b/core/src/test/resources/benchmarks/benchmarkStringIndexOneHot.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkStringIndexOneHot.json rename to core/src/test/resources/benchmarks/benchmarkStringIndexOneHot.json diff --git a/src/test/resources/benchmarks/benchmarkStringMissing.json b/core/src/test/resources/benchmarks/benchmarkStringMissing.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkStringMissing.json rename to core/src/test/resources/benchmarks/benchmarkStringMissing.json diff --git a/src/test/resources/benchmarks/benchmarkVectors.json b/core/src/test/resources/benchmarks/benchmarkVectors.json similarity index 100% rename from src/test/resources/benchmarks/benchmarkVectors.json rename to core/src/test/resources/benchmarks/benchmarkVectors.json diff --git a/src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv b/core/src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv similarity index 100% rename from src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv rename to core/src/test/resources/benchmarks/benchmarks_VerifyTrainClassifier.csv diff --git a/src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv b/core/src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv similarity index 100% rename from src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv rename to core/src/test/resources/benchmarks/benchmarks_VerifyTuneHyperparameters.csv diff --git a/src/test/resources/benchmarks/benchmarks_VerifyVowpalWabbitRegressor.csv b/core/src/test/resources/benchmarks/benchmarks_VerifyVowpalWabbitRegressor.csv similarity index 100% rename from src/test/resources/benchmarks/benchmarks_VerifyVowpalWabbitRegressor.csv rename to core/src/test/resources/benchmarks/benchmarks_VerifyVowpalWabbitRegressor.csv diff --git a/src/test/resources/demoUsage.csv.gz b/core/src/test/resources/demoUsage.csv.gz similarity index 100% rename from src/test/resources/demoUsage.csv.gz rename to core/src/test/resources/demoUsage.csv.gz diff --git a/src/test/resources/dialogue.mp3 b/core/src/test/resources/dialogue.mp3 similarity index 100% rename from src/test/resources/dialogue.mp3 rename to core/src/test/resources/dialogue.mp3 diff --git a/src/test/resources/lily.wav b/core/src/test/resources/lily.wav similarity index 100% rename from 
src/test/resources/lily.wav rename to core/src/test/resources/lily.wav diff --git a/src/test/resources/mark.wav b/core/src/test/resources/mark.wav similarity index 100% rename from src/test/resources/mark.wav rename to core/src/test/resources/mark.wav diff --git a/src/test/resources/sim_count1.csv.gz b/core/src/test/resources/sim_count1.csv.gz similarity index 100% rename from src/test/resources/sim_count1.csv.gz rename to core/src/test/resources/sim_count1.csv.gz diff --git a/src/test/resources/sim_count3.csv.gz b/core/src/test/resources/sim_count3.csv.gz similarity index 100% rename from src/test/resources/sim_count3.csv.gz rename to core/src/test/resources/sim_count3.csv.gz diff --git a/src/test/resources/sim_jac1.csv.gz b/core/src/test/resources/sim_jac1.csv.gz similarity index 100% rename from src/test/resources/sim_jac1.csv.gz rename to core/src/test/resources/sim_jac1.csv.gz diff --git a/src/test/resources/sim_jac3.csv.gz b/core/src/test/resources/sim_jac3.csv.gz similarity index 100% rename from src/test/resources/sim_jac3.csv.gz rename to core/src/test/resources/sim_jac3.csv.gz diff --git a/src/test/resources/sim_lift1.csv.gz b/core/src/test/resources/sim_lift1.csv.gz similarity index 100% rename from src/test/resources/sim_lift1.csv.gz rename to core/src/test/resources/sim_lift1.csv.gz diff --git a/src/test/resources/sim_lift3.csv.gz b/core/src/test/resources/sim_lift3.csv.gz similarity index 100% rename from src/test/resources/sim_lift3.csv.gz rename to core/src/test/resources/sim_lift3.csv.gz diff --git a/src/test/resources/user_aff.csv.gz b/core/src/test/resources/user_aff.csv.gz similarity index 100% rename from src/test/resources/user_aff.csv.gz rename to core/src/test/resources/user_aff.csv.gz diff --git a/src/test/resources/userpred_count3_userid_only.csv.gz b/core/src/test/resources/userpred_count3_userid_only.csv.gz similarity index 100% rename from src/test/resources/userpred_count3_userid_only.csv.gz rename to core/src/test/resources/userpred_count3_userid_only.csv.gz diff --git a/src/test/resources/userpred_jac3_userid_only.csv.gz b/core/src/test/resources/userpred_jac3_userid_only.csv.gz similarity index 100% rename from src/test/resources/userpred_jac3_userid_only.csv.gz rename to core/src/test/resources/userpred_jac3_userid_only.csv.gz diff --git a/src/test/resources/userpred_lift3_userid_only.csv.gz b/core/src/test/resources/userpred_lift3_userid_only.csv.gz similarity index 100% rename from src/test/resources/userpred_lift3_userid_only.csv.gz rename to core/src/test/resources/userpred_lift3_userid_only.csv.gz diff --git a/src/test/scala/com/microsoft/ml/spark/Secrets.scala b/core/src/test/scala/com/microsoft/ml/spark/Secrets.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/Secrets.scala rename to core/src/test/scala/com/microsoft/ml/spark/Secrets.scala diff --git a/src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala b/core/src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala rename to core/src/test/scala/com/microsoft/ml/spark/automl/VerifyFindBestModel.scala diff --git a/src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala b/core/src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala rename to 
core/src/test/scala/com/microsoft/ml/spark/automl/VerifyTuneHyperparameters.scala
diff --git a/core/src/test/scala/com/microsoft/ml/spark/codegen/TestGen.scala b/core/src/test/scala/com/microsoft/ml/spark/codegen/TestGen.scala
new file mode 100644
index 0000000000..b46aefd7b7
--- /dev/null
+++ b/core/src/test/scala/com/microsoft/ml/spark/codegen/TestGen.scala
@@ -0,0 +1,49 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.codegen
+
+import java.io.File
+
+import com.microsoft.ml.spark.codegen.CodegenConfigProtocol._
+import com.microsoft.ml.spark.core.env.FileUtilities._
+import com.microsoft.ml.spark.core.test.base.TestBase
+import com.microsoft.ml.spark.core.test.fuzzing.PyTestFuzzing
+import com.microsoft.ml.spark.core.utils.JarLoadingUtils.instantiateServices
+import org.apache.commons.io.FileUtils
+import spray.json._
+
+
+object TestGen {
+
+  import CodeGenUtils._
+
+  def generatePythonTests(conf: CodegenConfig): Unit = {
+    instantiateServices[PyTestFuzzing[_]]().foreach { ltc =>
+      try {
+        ltc.makePyTestFile(conf)
+      } catch {
+        case _: NotImplementedError =>
+          println(s"ERROR: Could not generate test for ${ltc.testClassName} because of Complex Parameters")
+      }
+    }
+  }
+
+  private def makeInitFiles(conf: CodegenConfig, packageFolder: String = ""): Unit = {
+    val dir = new File(new File(conf.pyTestDir, "mmlsparktest"), packageFolder)
+    writeFile(new File(dir, "__init__.py"), "")
+    dir.listFiles().filter(_.isDirectory).foreach(f =>
+      makeInitFiles(conf, packageFolder + "/" + f.getName)
+    )
+  }
+
+  def main(args: Array[String]): Unit = {
+    val conf = args.head.parseJson.convertTo[CodegenConfig]
+    clean(conf.testDataDir)
+    clean(conf.pyTestDir)
+    generatePythonTests(conf)
+    TestBase.stopSparkSession()
+    FileUtils.copyDirectoryToDirectory(toDir(conf.pyTestOverrideDir), toDir(conf.pyTestDir))
+    makeInitFiles(conf)
+  }
+}
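`TestGen` is a plain `main` entry point: it parses a `CodegenConfig` from its single JSON argument, clears the previous outputs, regenerates the Python fuzzing tests, overlays the hand-written overrides, and seeds `__init__.py` files so the generated tree imports as a package. A hedged sketch of driving it programmatically (the environment variable name is a stand-in; in practice the build supplies the serialized config):

```scala
import com.microsoft.ml.spark.codegen.TestGen

// Hypothetical driver: MMLSPARK_CODEGEN_CONFIG is an assumed name for
// wherever the build keeps the serialized CodegenConfig JSON.
val confJson: String = sys.env("MMLSPARK_CODEGEN_CONFIG")
TestGen.main(Array(confJson))
```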
diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/HashingTFSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/IDFSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/NGramSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/OneHotEncoderSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/ml/Word2VecSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/SparkBindingsTest.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/TestCategoricals.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifyFastVectorAssembler.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala b/core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/schema/VerifySparkSchema.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala b/core/src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/serialize/ValidateComplexParamSerializer.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala
similarity index 97%
rename from src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala
index faaf19398e..4b2de24973 100644
--- a/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/core/test/base/SparkSessionFactory.scala
@@ -47,7 +47,6 @@ object SparkSessionFactory {
     val sess = SparkSession.builder()
       .config(conf)
       .getOrCreate()
-    sess.sparkContext.setLogLevel(logLevel)
     sess
   }
diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/test/base/TestBase.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/test/benchmarks/Benchmarks.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala
similarity index 90%
rename from src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala
index 9adbad6723..9ee92739c5 100644
--- a/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/Fuzzing.scala
@@ -7,7 +7,7 @@ import java.io.File
 import java.nio.charset.StandardCharsets
 import java.nio.file.Files
 
-import com.microsoft.ml.spark.codegen.Config
+import com.microsoft.ml.spark.codegen.CodegenConfig
 import com.microsoft.ml.spark.core.env.FileUtilities
 import com.microsoft.ml.spark.core.test.base.TestBase
 import org.apache.commons.io.FileUtils
@@ -50,17 +50,17 @@ trait PyTestFuzzing[S <: PipelineStage] extends TestBase with DataFrameEquality
 
   val testClassName: String = this.getClass.getName.split(".".toCharArray).last
 
-  val testDataDir: File = FileUtilities.join(
-    Config.TestDataDir, this.getClass.getName.split(".".toCharArray).last)
+  def testDataDir(conf: CodegenConfig): File = FileUtilities.join(
+    conf.testDataDir, this.getClass.getName.split(".".toCharArray).last)
 
-  def saveDataset(df: DataFrame, name: String): Unit = {
-    df.write.mode("overwrite").parquet(new File(testDataDir, s"$name.parquet").toString)
+  def saveDataset(conf: CodegenConfig, df: DataFrame, name: String): Unit = {
+    df.write.mode("overwrite").parquet(new File(testDataDir(conf), s"$name.parquet").toString)
   }
 
-  def saveModel(model: S, name: String): Unit = {
+  def saveModel(conf: CodegenConfig, model: S, name: String): Unit = {
     model match {
       case writable: MLWritable =>
-        writable.write.overwrite().save(new File(testDataDir, s"$name.model").toString)
+        writable.write.overwrite().save(new File(testDataDir(conf), s"$name.model").toString)
       case _ =>
         throw new IllegalArgumentException(s"${model.getClass.getName} is not writable")
     }
@@ -69,14 +69,14 @@ trait PyTestFuzzing[S <: PipelineStage] extends TestBase with DataFrameEquality
 
   val testFitting = false
 
-  def saveTestData(): Unit = {
-    testDataDir.mkdirs()
+  def saveTestData(conf: CodegenConfig): Unit = {
+    testDataDir(conf).mkdirs()
     pyTestObjects().zipWithIndex.foreach { case (to, i) =>
-      saveModel(to.stage, s"model-$i")
+      saveModel(conf, to.stage, s"model-$i")
       if (testFitting) {
-        saveDataset(to.fitDF, s"fit-$i")
-        saveDataset(to.transDF, s"trans-$i")
-        to.validateDF.foreach(saveDataset(_, s"val-$i"))
+        saveDataset(conf, to.fitDF, s"fit-$i")
+        saveDataset(conf, to.transDF, s"trans-$i")
+        to.validateDF.foreach(saveDataset(conf, _, s"val-$i"))
       }
     }
   }
@@ -144,9 +144,9 @@ trait PyTestFuzzing[S <: PipelineStage] extends TestBase with DataFrameEquality
   }
 
-  def makePyTestFile(): Unit = {
+  def makePyTestFile(conf: CodegenConfig): Unit = {
     spark
-    saveTestData()
+    saveTestData(conf)
     val generatedTests = pyTestObjects().zipWithIndex.map { case (to, i) => makePyTests(to, i) }
     val stage = pyTestObjects().head.stage
     val stageName = stage.getClass.getName.split(".".toCharArray).last
@@ -159,7 +159,7 @@ trait PyTestFuzzing[S <: PipelineStage] extends TestBase with DataFrameEquality
        |from os.path import join
        |import json
        |
-       |test_data_dir = "${testDataDir.toString.replaceAllLiterally("\\", "\\\\")}"
+       |test_data_dir = "${testDataDir(conf).toString.replaceAllLiterally("\\", "\\\\")}"
        |
        |
        |class $testClassName(unittest.TestCase):
@@ -180,7 +180,7 @@ trait PyTestFuzzing[S <: PipelineStage] extends TestBase with DataFrameEquality
     val testFolders = importPath.mkString(".")
       .replaceAllLiterally("com.microsoft.ml.spark", "mmlsparktest").split(".".toCharArray)
-    val testDir = FileUtilities.join((Seq(Config.PyTestDir.toString) ++ testFolders.toSeq): _*)
+    val testDir = FileUtilities.join((Seq(conf.pyTestDir.toString) ++ testFolders.toSeq): _*)
     testDir.mkdirs()
     Files.write(
       FileUtilities.join(testDir, "test_" + camelToSnake(testClassName) + ".py").toPath,
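The thrust of this hunk: `PyTestFuzzing` no longer reads paths off a global `Config` object; `testDataDir`, `saveDataset`, `saveModel`, `saveTestData`, and `makePyTestFile` all take the `CodegenConfig` explicitly, so one JVM can generate tests for several differently configured modules. A minimal sketch of a suite under the new API (`MyStage` and `inputDF` are hypothetical placeholders for a real `PipelineStage` and DataFrame):

```scala
import com.microsoft.ml.spark.core.test.fuzzing.{PyTestFuzzing, TestObject}

// Hypothetical suite; MyStage stands in for a real PipelineStage.
class MyStageFuzzing extends PyTestFuzzing[MyStage] {
  override def pyTestObjects(): Seq[TestObject[MyStage]] =
    Seq(new TestObject(new MyStage(), inputDF))
}

// Generation is then driven per-config rather than via a global:
//   TestGen.generatePythonTests(conf)  // calls makePyTestFile(conf) on each suite
```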
diff --git a/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala b/core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala
similarity index 98%
rename from src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala
index ce573f761d..67d31910dc 100644
--- a/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/core/test/fuzzing/FuzzingTest.scala
@@ -257,17 +257,17 @@ class FuzzingTest extends TestBase {
   // set the context loader to pick up on the jars
   //Thread.currentThread().setContextClassLoader(JarLoadingUtils.classLoader)
 
-  private lazy val readers: List[MLReadable[_]] = JarLoadingUtils.instantiateObjects[MLReadable[_]]
+  private lazy val readers: List[MLReadable[_]] = JarLoadingUtils.instantiateObjects[MLReadable[_]]()
 
-  private lazy val pipelineStages: List[PipelineStage] = JarLoadingUtils.instantiateServices[PipelineStage]
+  private lazy val pipelineStages: List[PipelineStage] = JarLoadingUtils.instantiateServices[PipelineStage]()
 
   private lazy val experimentFuzzers: List[ExperimentFuzzing[_ <: PipelineStage]] =
-    JarLoadingUtils.instantiateServices[ExperimentFuzzing[_ <: PipelineStage]]
+    JarLoadingUtils.instantiateServices[ExperimentFuzzing[_ <: PipelineStage]]()
 
   private lazy val serializationFuzzers: List[SerializationFuzzing[_ <: PipelineStage with MLWritable]] =
-    JarLoadingUtils.instantiateServices[SerializationFuzzing[_ <: PipelineStage with MLWritable]]
+    JarLoadingUtils.instantiateServices[SerializationFuzzing[_ <: PipelineStage with MLWritable]]()
 
   private lazy val pytestFuzzers: List[PyTestFuzzing[_ <: PipelineStage]] =
-    JarLoadingUtils.instantiateServices[PyTestFuzzing[_ <: PipelineStage]]
+    JarLoadingUtils.instantiateServices[PyTestFuzzing[_ <: PipelineStage]]()
 }
diff --git a/src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala b/core/src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala
rename to core/src/test/scala/com/microsoft/ml/spark/core/utils/VerifyClusterUtil.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCleanMissingData.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyCountSelector.scala
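One note on the `FuzzingTest` hunk above: the only change is the added empty argument lists, which lines up with `instantiateServices`/`instantiateObjects` having gained a parameter (presumably with a default) in the moved `JarLoadingUtils`; `TestGen.generatePythonTests` uses the same calling convention (`instantiateServices[PyTestFuzzing[_]]()`).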
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyDataConversion.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyFeaturize.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/VerifyValueIndexer.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/text/MultiNGramSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/text/PageSplitterSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala
rename to core/src/test/scala/com/microsoft/ml/spark/featurize/text/TextFeaturizerSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/flaky/PartitionConsolidatorSuite.scala
diff --git a/core/src/test/scala/com/microsoft/ml/spark/image/ImageTestUtils.scala b/core/src/test/scala/com/microsoft/ml/spark/image/ImageTestUtils.scala
new file mode 100644
index 0000000000..84e516c549
--- /dev/null
+++ b/core/src/test/scala/com/microsoft/ml/spark/image/ImageTestUtils.scala
@@ -0,0 +1,109 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.ml.spark.image
+
+import java.io.File
+import java.net.URL
+
+import com.microsoft.ml.spark.build.BuildInfo
+import com.microsoft.ml.spark.core.env.FileUtilities
+import org.apache.spark.ml.linalg.DenseVector
+import org.apache.spark.sql.{DataFrame, SparkSession}
+import com.microsoft.ml.spark.core.test.base.TestBase
+import com.microsoft.ml.spark.io.IOImplicits.dfrToDfre
+import org.apache.commons.io.FileUtils
+import org.apache.spark.sql.functions.col
+
+trait ImageTestUtils extends TestBase {
+
+  val filesRoot = BuildInfo.datasetDir.toString
+  val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString
+  val modelPath = FileUtilities.join(filesRoot, "CNTKModel", "ConvNet_CIFAR10.model").toString
+  val inputCol = "cntk_images"
+  val outputCol = "out"
+  val labelCol = "labels"
+
+  val featureVectorLength = 3 * 32 * 32
+  lazy val saveFile = new File(tmpDir.toFile, "spark-z.model").toString
+
+  def testModelDF(spark: SparkSession): DataFrame = {
+    import spark.implicits._
+    spark.sparkContext.parallelize(Seq(
+      Array(1.32165250, -2.1215112, 0.63150704, 0.77315974, -1.28163720,
+        -0.20210080, -2.2839167, -2.08691480, 5.08418200, -1.33741090),
+      Array(3.44079640, 1.4877119, -0.74059330, -0.34381202, -2.48724990,
+        -2.62866950, -3.1693816, -3.14182600, 4.76314800, 0.68712880),
+      Array(-1.88747900, -4.7685330, 0.15169683, 6.80547570, -0.38405967,
+        3.41065170, 1.3302778, -0.87714905, -2.18046050, -4.16661830),
+      Array(5.01010300, 3.9860306, -1.36795600, -0.89830830, -4.49545430,
+        -4.19537070, -4.4045380, -5.81759450, 6.93805700, 1.49001510),
+      Array(-4.70754600, -6.0414960, 1.20658250, 5.40738300, 1.07661690,
+        4.71566440, 4.3834330, -1.57187440, -2.96569730, -5.43208270),
+      Array(-1.23873880, -3.2042341, 2.54533000, 5.51954800, 2.89042470,
+        0.12380804, 3.8639085, -4.79466800, -2.41463420, -5.17418430))).toDF
+  }
+
+  def testImages(spark: SparkSession): DataFrame = {
+    val images = spark.read.image.load(imagePath)
+
+    val unroll = new UnrollImage().setInputCol("image").setOutputCol(inputCol)
+
+    unroll.transform(images).select(inputCol)
+  }
+
+  def makeFakeData(spark: SparkSession, rows: Int, size: Int, outputDouble: Boolean = false): DataFrame = {
+    import spark.implicits._
+    if (outputDouble) {
+      List
+        .fill(rows)(List.fill(size)(0.0).toArray)
+        .zip(List.fill(rows)(0.0))
+        .toDF(inputCol, labelCol)
+    } else {
+      List
+        .fill(rows)(List.fill(size)(0.0.toFloat).toArray)
+        .zip(List.fill(rows)(0.0))
+        .toDF(inputCol, labelCol)
+    }
+  }
+
+  protected def compareToTestModel(result: DataFrame) = {
+    //TODO improve checks
+    assert(result.columns.toSet == Set(inputCol, outputCol))
+    assert(result.count() == testModelDF(result.sparkSession).count())
+    val max = result
+      .select(outputCol)
+      .collect()
+      .map(row => row.getAs[DenseVector](0).toArray.max)
+      .max
+    assert(max < 10 & max > -10)
+  }
+
+  lazy val images: DataFrame = spark.read.image.load(imagePath)
+    .withColumnRenamed("image", inputCol)
+  lazy val binaryImages: DataFrame = spark.read.binary.load(imagePath)
+    .select(col("value.bytes").alias(inputCol))
+
+  lazy val groceriesPath = FileUtilities.join(BuildInfo.datasetDir, "Images", "Grocery")
+  lazy val groceryImages: DataFrame = spark.read.image
+    .option("dropInvalid", true)
+    .load(groceriesPath + "**")
+    .withColumnRenamed("image", inputCol)
+
+  lazy val greyscaleImageLocation: String = {
+    val loc = "/tmp/greyscale.jpg"
+    val f = new File(loc)
+    if (f.exists()) {f.delete()}
+    FileUtils.copyURLToFile(new URL("https://mmlspark.blob.core.windows.net/datasets/LIME/greyscale.jpg"), f)
+    loc
+  }
+
+  lazy val greyscaleImage: DataFrame = spark
+    .read.image.load(greyscaleImageLocation)
+    .select(col("image").alias(inputCol))
+
+  lazy val greyscaleBinary: DataFrame = spark
+    .read.binary.load(greyscaleImageLocation)
+    .select(col("value.bytes").alias(inputCol))
+
+}
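`ImageTestUtils` consolidates the CIFAR paths, the reference model outputs, and the lazily downloaded greyscale fixtures that image-related suites previously pulled from `CNTKTestUtils`/`NetworkUtils` (see the `SuperpixelSuite` and `SuperpixelTransformerSuite` hunks below). A sketch of a consuming suite, assuming the dataset layout under `BuildInfo.datasetDir` that the trait expects:

```scala
import com.microsoft.ml.spark.image.ImageTestUtils

// Hypothetical suite mixing in the shared fixtures.
class MyImageSuite extends ImageTestUtils {
  test("CIFAR images load and unroll into the expected column") {
    val df = testImages(spark)
    assert(df.columns.sameElements(Array(inputCol)))
    assert(df.count() > 0)
  }
}
```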
URL("https://mmlspark.blob.core.windows.net/datasets/LIME/greyscale.jpg"), f) + loc + } + + lazy val greyscaleImage: DataFrame = spark + .read.image.load(greyscaleImageLocation) + .select(col("image").alias(inputCol)) + + lazy val greyscaleBinary: DataFrame = spark + .read.binary.load(greyscaleImageLocation) + .select(col("value.bytes").alias(inputCol)) + +} diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/BinaryFileReaderSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/HTTPTransformerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala similarity index 99% rename from src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala index 13592cec90..b611ef5158 100644 --- a/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/io/split1/ImageReaderSuite.scala @@ -5,7 +5,7 @@ package com.microsoft.ml.spark.io.split1 import java.io.{File, FileInputStream} -import com.microsoft.ml.spark.cognitive.OsUtils +import com.microsoft.ml.spark.core.utils.OsUtils import com.microsoft.ml.spark.core.env.FileUtilities import com.microsoft.ml.spark.core.schema.ImageSchemaUtils import com.microsoft.ml.spark.core.test.base.TestBase diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/ParserSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/PowerBiSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split1/SimpleHTTPTransformerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/ContinuousHTTPSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala 
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/DistributedHTTPSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPSuite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala b/core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala
rename to core/src/test/scala/com/microsoft/ml/spark/io/split2/HTTPv2Suite.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala b/core/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala
similarity index 98%
rename from src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala
rename to core/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala
index e623605967..2ee5fd153e 100644
--- a/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala
+++ b/core/src/test/scala/com/microsoft/ml/spark/isolationforest/VerifyIsolationForest.scala
@@ -86,10 +86,6 @@ class VerifyIsolationForest extends Benchmarks with EstimatorFuzzing[IsolationFo
     data
   }
 
-  test("foo"){
-    new IsolationForest().makePyFile()
-  }
-
   override def reader: MLReadable[_] = IsolationForest
   override def modelReader: MLReadable[_] = IsolationForestModel
diff --git a/core/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala
new file mode 100644
index 0000000000..b58e597944
--- /dev/null
+++ b/core/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala
@@ -0,0 +1,66 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+ +package com.microsoft.ml.spark.lime + +import breeze.linalg.{*, DenseMatrix} +import breeze.stats.distributions.Rand +import com.microsoft.ml.spark.core.test.base.TestBase +import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject, TransformerFuzzing} +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.ml.param.DataFrameEquality +import org.apache.spark.ml.regression.LinearRegression +import org.apache.spark.ml.util.MLReadable + +trait LimeTestBase extends TestBase { + + import spark.implicits._ + + lazy val nRows = 100 + lazy val d1 = 3 + lazy val d2 = 1 + + lazy val m: DenseMatrix[Double] = new DenseMatrix(d1, d2, Array(1.0, -1.0, 2.0)) + lazy val x: DenseMatrix[Double] = DenseMatrix.rand(nRows, d1, Rand.gaussian) + lazy val noise: DenseMatrix[Double] = DenseMatrix.rand(nRows, d2, Rand.gaussian) * 0.1 + lazy val y = x * m //+ noise + + lazy val xRows = x(*, ::).iterator.toSeq.map(dv => new DenseVector(dv.toArray)) + lazy val yRows = y(*, ::).iterator.toSeq.map(dv => dv(0)) + lazy val df = xRows.zip(yRows).toDF("features", "label") + + lazy val model = new LinearRegression().fit(df) + + lazy val lime = new TabularLIME() + .setModel(model) + .setInputCol("features") + .setPredictionCol(model.getPredictionCol) + .setOutputCol("out") + .setNSamples(1000) + + lazy val limeModel = lime.fit(df) +} + +class TabularLIMESuite extends EstimatorFuzzing[TabularLIME] with + DataFrameEquality with LimeTestBase { + + test("text lime usage test check") { + val results = limeModel.transform(df).select("out") + .collect().map(_.getAs[DenseVector](0)) + results.foreach(result => assert(result === new DenseVector(m.data))) + } + + override def testObjects(): Seq[TestObject[TabularLIME]] = Seq(new TestObject(lime, df)) + + override def reader: MLReadable[_] = TabularLIME + + override def modelReader: MLReadable[_] = TabularLIMEModel +} + +class TabularLIMEModelSuite extends TransformerFuzzing[TabularLIMEModel] with + DataFrameEquality with LimeTestBase { + + override def testObjects(): Seq[TestObject[TabularLIMEModel]] = Seq(new TestObject(limeModel, df)) + + override def reader: MLReadable[_] = TabularLIMEModel +} diff --git a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala similarity index 96% rename from src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala index 5d2c26e330..289720f969 100644 --- a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelSuite.scala @@ -7,13 +7,13 @@ import java.awt.Color import java.awt.image.BufferedImage import java.io.File -import com.microsoft.ml.spark.cntk.CNTKTestUtils +import com.microsoft.ml.spark.image.ImageTestUtils import com.microsoft.ml.spark.io.image.ImageUtils import javax.imageio.ImageIO import scala.util.Random -class SuperpixelSuite extends CNTKTestUtils { +class SuperpixelSuite extends ImageTestUtils { lazy val sp1 = new Superpixel(img, 16, 130) lazy val sp2 = new Superpixel(img2, 100, 130) diff --git a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala similarity index 90% rename from src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala index 
881aefed41..0c4a5b78d0 100644 --- a/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/lime/SuperpixelTransformerSuite.scala @@ -4,12 +4,12 @@ package com.microsoft.ml.spark.lime import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} -import com.microsoft.ml.spark.image.NetworkUtils +import com.microsoft.ml.spark.image.ImageTestUtils import com.microsoft.ml.spark.io.split1.FileReaderUtils import org.apache.spark.ml.util.MLReadable class SuperpixelTransformerSuite extends TransformerFuzzing[SuperpixelTransformer] - with NetworkUtils with FileReaderUtils { + with ImageTestUtils with FileReaderUtils { lazy val spt: SuperpixelTransformer = new SuperpixelTransformer().setInputCol(inputCol) test("basic functionality"){ diff --git a/src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala b/core/src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/lime/TextLIMESuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala b/core/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala similarity index 97% rename from src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala rename to core/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala index 641afac626..e72432bd34 100644 --- a/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/nbtest/DatabricksUtilities.scala @@ -42,9 +42,7 @@ object DatabricksUtilities extends HasHttpClient { val Folder = s"/MMLSparkBuild/build_${BuildInfo.version}" // MMLSpark info - val TruncatedScalaVersion: String = BuildInfo.scalaVersion - .split(".".toCharArray.head).dropRight(1).mkString(".") - val Version = s"com.microsoft.ml.spark:${BuildInfo.name}_$TruncatedScalaVersion:${BuildInfo.version}" + val Version = s"com.microsoft.ml.spark:mmlspark:${BuildInfo.version}" val Repository = "https://mmlspark.azureedge.net/maven" val Libraries: String = List( @@ -59,7 +57,7 @@ object DatabricksUtilities extends HasHttpClient { val TimeoutInMillis: Int = 40 * 60 * 1000 val NotebookFiles: Array[File] = Option( - FileUtilities.join(BuildInfo.baseDirectory, "notebooks", "samples").getCanonicalFile.listFiles() + FileUtilities.join(BuildInfo.baseDirectory.getParent, "notebooks").getCanonicalFile.listFiles() ).get val ParallizableNotebooks = NotebookFiles.filterNot(_.getName.contains("Vowpal")) diff --git a/src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala b/core/src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala rename to core/src/test/scala/com/microsoft/ml/spark/nbtest/NotebookTests.scala diff --git a/src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala b/core/src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala rename to core/src/test/scala/com/microsoft/ml/spark/nbtest/SprayUtilities.scala diff --git a/src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala b/core/src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala 
rename to core/src/test/scala/com/microsoft/ml/spark/nn/BallTreeTest.scala diff --git a/src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala b/core/src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala rename to core/src/test/scala/com/microsoft/ml/spark/nn/ConditionalBallTreeTest.scala diff --git a/src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala b/core/src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala rename to core/src/test/scala/com/microsoft/ml/spark/nn/KNNTest.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingAdapterSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingEvaluatorSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTestBase.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RankingTrainValidationSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/RecommendationIndexerSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/recommendation/SARSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/BatchIteratorSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala similarity index 100% rename from 
src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/CacherSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/ClassBalancerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala similarity index 92% rename from src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala index 1507d15250..c96764cfd2 100644 --- a/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/stages/DropColumnsSuite.scala @@ -3,9 +3,8 @@ package com.microsoft.ml.spark.stages -import com.microsoft.ml.spark.codegen.Config import com.microsoft.ml.spark.core.test.base.TestBase -import com.microsoft.ml.spark.core.test.fuzzing.{PyTestFuzzing, TestObject, TransformerFuzzing} +import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import org.apache.spark.ml.util.MLReadable class DropColumnsSuite extends TestBase with TransformerFuzzing[DropColumns] { diff --git a/src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/EnsembleByKeySuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/ExplodeSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/LambdaSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/MiniBatchTransformerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/MultiColumnAdapterSpec.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala rename to 
core/src/test/scala/com/microsoft/ml/spark/stages/RenameColumnSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/RepartitionSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/SelectColumnsSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/StratifiedRepartitionSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/SummarizeDataSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/TextPreprocessorSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/TimerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/UDFSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/UDFTransformerSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala rename to core/src/test/scala/com/microsoft/ml/spark/stages/UnicodeNormalizeSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala rename to 
core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputeModelStatistics.scala diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyComputePerInstanceStatistics.scala diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainClassifier.scala diff --git a/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala b/core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala rename to core/src/test/scala/com/microsoft/ml/spark/train/VerifyTrainRegressor.scala diff --git a/src/main/R/model_downloader.R b/deep-learning/src/main/R/model_downloader.R similarity index 100% rename from src/main/R/model_downloader.R rename to deep-learning/src/main/R/model_downloader.R diff --git a/src/main/python/mmlspark/cntk/CNTKModel.py b/deep-learning/src/main/python/mmlspark/cntk/CNTKModel.py similarity index 100% rename from src/main/python/mmlspark/cntk/CNTKModel.py rename to deep-learning/src/main/python/mmlspark/cntk/CNTKModel.py diff --git a/src/test/python/mmlsparktest/__init__.py b/deep-learning/src/main/python/mmlspark/cntk/__init__.py similarity index 100% rename from src/test/python/mmlsparktest/__init__.py rename to deep-learning/src/main/python/mmlspark/cntk/__init__.py diff --git a/src/main/python/mmlspark/image/ImageFeaturizer.py b/deep-learning/src/main/python/mmlspark/image/ImageFeaturizer.py similarity index 100% rename from src/main/python/mmlspark/image/ImageFeaturizer.py rename to deep-learning/src/main/python/mmlspark/image/ImageFeaturizer.py diff --git a/src/test/python/mmlsparktest/cognitive/__init__.py b/deep-learning/src/main/python/mmlspark/image/__init__.py similarity index 100% rename from src/test/python/mmlsparktest/cognitive/__init__.py rename to deep-learning/src/main/python/mmlspark/image/__init__.py diff --git a/src/main/scala/com/microsoft/CNTK/SerializableFunction.scala b/deep-learning/src/main/scala/com/microsoft/CNTK/SerializableFunction.scala similarity index 100% rename from src/main/scala/com/microsoft/CNTK/SerializableFunction.scala rename to deep-learning/src/main/scala/com/microsoft/CNTK/SerializableFunction.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKFunctionParam.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/CNTKModel.scala diff --git 
a/src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/ConversionUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt b/deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt rename to deep-learning/src/main/scala/com/microsoft/ml/spark/cntk/_CNTKModel.txt diff --git a/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala similarity index 91% rename from src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala index 3b68d0ee50..54f890242b 100644 --- a/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala +++ b/deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/ModelDownloader.scala @@ -7,6 +7,7 @@ import java.io._ import java.net.{URI, URL} import java.util +import com.microsoft.ml.spark.core.utils.FaultToleranceUtils import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.{Configuration => HadoopConf} import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path} @@ -15,10 +16,8 @@ import org.apache.log4j.LogManager import org.apache.spark.sql.SparkSession import spray.json._ -import scala.annotation.tailrec import scala.collection.JavaConverters._ -import scala.concurrent.duration.{Duration, FiniteDuration} -import scala.concurrent.{Await, ExecutionContext, Future} +import scala.concurrent.duration.Duration /** Abstract representation of a repository for future expansion * @@ -34,32 +33,6 @@ private[spark] abstract class Repository[S <: Schema] { } -object FaultToleranceUtils { - def retryWithTimeout[T](times: Int, timeout: Duration)(f: => T): T ={ - try { - Await.result(Future(f)(ExecutionContext.global), timeout) - } catch { - case e: Exception if times >= 1 => - print(s"Received exception on call, retrying: $e") - retryWithTimeout(times-1, timeout)(f) - } - } - - val Backoffs: Seq[Int] = Seq(0, 100, 200, 500) - - def retryWithTimeout[T](times: Seq[Int] = Backoffs)(f: => T): T ={ - try { - f - } catch { - case e: Exception if times.nonEmpty => - println(s"Received exception on call, retrying: $e") - Thread.sleep(times.head) - retryWithTimeout(times.tail)(f) - } - } - -} - /** Exception returned if a repo cannot find the file * * @param uri : location of the file diff --git a/src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/downloader/Schema.scala diff --git a/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala b/deep-learning/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala rename to deep-learning/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala index 2db42e83b0..c1bb3e9e59 100644 --- a/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala +++ 
b/deep-learning/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.scala @@ -132,7 +132,7 @@ class ImageFeaturizer(val uid: String) extends Transformer with HasInputCol with /** @group getParam */ def getLayerNames: Array[String] = $(layerNames) - setDefault(cutOutputLayers -> 1, outputCol -> (uid + "_output"), dropNa->true) + setDefault(cutOutputLayers -> 1, outputCol -> (uid + "_output"), dropNa -> true) override def transform(dataset: Dataset[_]): DataFrame = { logTransform[DataFrame]({ diff --git a/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.txt b/deep-learning/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.txt rename to deep-learning/src/main/scala/com/microsoft/ml/spark/image/ImageFeaturizer.txt diff --git a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala similarity index 97% rename from src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala rename to deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala index 37b4b1ad61..f848394536 100644 --- a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKBindingSuite.scala @@ -9,11 +9,12 @@ import com.microsoft.CNTK.CNTKExtensions._ import com.microsoft.CNTK.{SerializableFunction => CNTKFunction, _} import com.microsoft.ml.spark.core.env.StreamUtilities._ import com.microsoft.ml.spark.core.test.base.LinuxOnly +import com.microsoft.ml.spark.image.ImageTestUtils import org.apache.commons.io.IOUtils import scala.collection.JavaConverters._ -class CNTKBindingSuite extends LinuxOnly with CNTKTestUtils { +class CNTKBindingSuite extends LinuxOnly with ImageTestUtils { def toSeqSeq(fvv: FloatVectorVector): Seq[Seq[Float]] = { (0 until fvv.size.toInt).map(i => diff --git a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala similarity index 97% rename from src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala rename to deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala index 34893a7015..8d2285be0a 100644 --- a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/cntk/CNTKModelSuite.scala @@ -10,6 +10,7 @@ import com.microsoft.ml.spark.build.BuildInfo import com.microsoft.ml.spark.core.env.FileUtilities import com.microsoft.ml.spark.core.test.base.LinuxOnly import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} +import com.microsoft.ml.spark.image.ImageTestUtils import org.apache.commons.io.FileUtils import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.linalg.DenseVector @@ -21,7 +22,7 @@ import org.apache.spark.sql.types._ import scala.util.Random -class CNTKModelSuite extends LinuxOnly with CNTKTestUtils with TransformerFuzzing[CNTKModel] { +class CNTKModelSuite extends LinuxOnly with ImageTestUtils with TransformerFuzzing[CNTKModel] { // TODO: Move away from getTempDirectoryPath and have TestBase provide one @@ -54,7 +55,7 @@ class CNTKModelSuite extends LinuxOnly with CNTKTestUtils with TransformerFuzzin .setOutputNodeIndex(0) } - lazy val images = testImages(spark) + override lazy val images = testImages(spark) import spark.implicits._ diff 
--git a/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala similarity index 97% rename from src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala rename to deep-learning/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala index ee6d53933a..f67e4b82d5 100644 --- a/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/downloader/DownloaderSuite.scala @@ -7,6 +7,7 @@ import java.io.File import java.nio.file.Files import com.microsoft.ml.spark.core.test.base.TestBase +import com.microsoft.ml.spark.core.utils.FaultToleranceUtils import org.apache.commons.io.FileUtils import scala.collection.JavaConverters._ diff --git a/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala similarity index 81% rename from src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala rename to deep-learning/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala index 247c7a421e..6733d1fa67 100644 --- a/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/image/ImageFeaturizerSuite.scala @@ -8,24 +8,20 @@ import java.net.{URI, URL} import com.microsoft.ml.spark.Secrets import com.microsoft.ml.spark.build.BuildInfo -import com.microsoft.ml.spark.cntk.CNTKTestUtils import com.microsoft.ml.spark.core.env.FileUtilities -import com.microsoft.ml.spark.core.test.base.TestBase import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} -import com.microsoft.ml.spark.core.utils.ModelEquality import com.microsoft.ml.spark.downloader.{ModelDownloader, ModelSchema} import com.microsoft.ml.spark.io.IOImplicits._ import com.microsoft.ml.spark.io.powerbi.PowerBIWriter import com.microsoft.ml.spark.io.split1.FileReaderUtils -import org.apache.commons.io.FileUtils import org.apache.spark.injections.UDFUtils import org.apache.spark.ml.linalg.DenseVector import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.functions.col import org.apache.spark.sql.types.StringType -trait NetworkUtils extends CNTKTestUtils with FileReaderUtils { +trait TrainedCNTKModelUtils extends ImageTestUtils with FileReaderUtils { lazy val modelDir = new File(filesRoot, "CNTKModel") lazy val modelDownloader = new ModelDownloader(spark, modelDir.toURI) @@ -33,33 +29,6 @@ trait NetworkUtils extends CNTKTestUtils with FileReaderUtils { lazy val resNetUri: URI = new File(modelDir, "ResNet50_ImageNet.model").toURI lazy val resNet: ModelSchema = modelDownloader.downloadByName("ResNet50") - lazy val images: DataFrame = spark.read.image.load(imagePath) - .withColumnRenamed("image", inputCol) - lazy val binaryImages: DataFrame = spark.read.binary.load(imagePath) - .select(col("value.bytes").alias(inputCol)) - - lazy val groceriesPath = FileUtilities.join(BuildInfo.datasetDir, "Images","Grocery") - lazy val groceryImages: DataFrame = spark.read.image - .option("dropInvalid", true) - .load(groceriesPath + "**") - .withColumnRenamed("image", inputCol) - - lazy val greyscaleImageLocation: String = { - val loc = "/tmp/greyscale.jpg" - val f = new File(loc) - if (f.exists()) {f.delete()} - FileUtils.copyURLToFile(new 
URL("https://mmlspark.blob.core.windows.net/datasets/LIME/greyscale.jpg"), f) - loc - } - - lazy val greyscaleImage: DataFrame = spark - .read.image.load(greyscaleImageLocation) - .select(col("image").alias(inputCol)) - - lazy val greyscaleBinary: DataFrame = spark - .read.binary.load(greyscaleImageLocation) - .select(col("value.bytes").alias(inputCol)) - def resNetModel(): ImageFeaturizer = new ImageFeaturizer() .setInputCol(inputCol) .setOutputCol(outputCol) @@ -68,7 +37,7 @@ trait NetworkUtils extends CNTKTestUtils with FileReaderUtils { } class ImageFeaturizerSuite extends TransformerFuzzing[ImageFeaturizer] - with NetworkUtils { + with TrainedCNTKModelUtils { test("Image featurizer should reproduce the CIFAR10 experiment") { print(spark) diff --git a/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala b/deep-learning/src/test/scala/com/microsoft/ml/spark/lime/ImageLIMESuite.scala similarity index 65% rename from src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala rename to deep-learning/src/test/scala/com/microsoft/ml/spark/lime/ImageLIMESuite.scala index e83f910e37..892ba9823d 100644 --- a/src/test/scala/com/microsoft/ml/spark/lime/LIMESuite.scala +++ b/deep-learning/src/test/scala/com/microsoft/ml/spark/lime/ImageLIMESuite.scala @@ -7,82 +7,23 @@ import java.awt.image.BufferedImage import java.io.File import java.net.URL -import breeze.linalg.{*, DenseMatrix} -import breeze.stats.distributions.Rand -import com.microsoft.ml.spark.core.test.base.TestBase -import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject, TransformerFuzzing} -import com.microsoft.ml.spark.image.{ImageFeaturizer, NetworkUtils} +import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} +import com.microsoft.ml.spark.image.{ImageFeaturizer, TrainedCNTKModelUtils} import com.microsoft.ml.spark.io.IOImplicits._ import com.microsoft.ml.spark.io.image.ImageUtils import com.microsoft.ml.spark.io.split1.FileReaderUtils import com.microsoft.ml.spark.stages.UDFTransformer import com.microsoft.ml.spark.stages.udfs.get_value_udf import org.apache.commons.io.FileUtils -import org.apache.spark.injections.UDFUtils import org.apache.spark.ml.linalg.DenseVector import org.apache.spark.ml.param.DataFrameEquality -import org.apache.spark.ml.regression.LinearRegression import org.apache.spark.ml.util.MLReadable import org.apache.spark.ml.{NamespaceInjections, PipelineModel} import org.apache.spark.sql.functions.col -import org.apache.spark.sql.types.DoubleType import org.apache.spark.sql.{DataFrame, Row} -trait LimeTestBase extends TestBase { - - import spark.implicits._ - - lazy val nRows = 100 - lazy val d1 = 3 - lazy val d2 = 1 - - lazy val m: DenseMatrix[Double] = new DenseMatrix(d1, d2, Array(1.0, -1.0, 2.0)) - lazy val x: DenseMatrix[Double] = DenseMatrix.rand(nRows, d1, Rand.gaussian) - lazy val noise: DenseMatrix[Double] = DenseMatrix.rand(nRows, d2, Rand.gaussian) * 0.1 - lazy val y = x * m //+ noise - - lazy val xRows = x(*, ::).iterator.toSeq.map(dv => new DenseVector(dv.toArray)) - lazy val yRows = y(*, ::).iterator.toSeq.map(dv => dv(0)) - lazy val df = xRows.zip(yRows).toDF("features", "label") - - lazy val model = new LinearRegression().fit(df) - - lazy val lime = new TabularLIME() - .setModel(model) - .setInputCol("features") - .setPredictionCol(model.getPredictionCol) - .setOutputCol("out") - .setNSamples(1000) - - lazy val limeModel = lime.fit(df) -} - -class TabularLIMESuite extends EstimatorFuzzing[TabularLIME] with - DataFrameEquality with LimeTestBase { 
- - test("text lime usage test check") { - val results = limeModel.transform(df).select("out") - .collect().map(_.getAs[DenseVector](0)) - results.foreach(result => assert(result === new DenseVector(m.data))) - } - - override def testObjects(): Seq[TestObject[TabularLIME]] = Seq(new TestObject(lime, df)) - - override def reader: MLReadable[_] = TabularLIME - - override def modelReader: MLReadable[_] = TabularLIMEModel -} - -class TabularLIMEModelSuite extends TransformerFuzzing[TabularLIMEModel] with - DataFrameEquality with LimeTestBase { - - override def testObjects(): Seq[TestObject[TabularLIMEModel]] = Seq(new TestObject(limeModel, df)) - - override def reader: MLReadable[_] = TabularLIMEModel -} - class ImageLIMESuite extends TransformerFuzzing[ImageLIME] with - DataFrameEquality with NetworkUtils with FileReaderUtils { + DataFrameEquality with TrainedCNTKModelUtils with FileReaderUtils { lazy val greyhoundImageLocation: String = { val loc = "/tmp/greyhound.jpg" diff --git a/docs/cogsvc.md b/docs/cogsvc.md index edec95f375..949ae14c96 100644 --- a/docs/cogsvc.md +++ b/docs/cogsvc.md @@ -9,7 +9,7 @@ Azure Cognitive Services on Spark enable working with Azure’s Intelligent Services at massive scales with the Apache Spark™ distributed computing ecosystem. Cognitive Services on Spark allows users to embed general purpose and continuously improving intelligent models directly into their Apache Spark™ and SQL computations. This liberates developers from low-level networking details, so they can focus on creating intelligent, distributed applications. Each Cognitive Service acts as a SparkML transformer, so users can add services to existing SparkML pipelines. This is a great example of our [HTTP-on-Spark](http.md) capability that lets you interact with HTTP services from Apache Spark. ## Usage -To see an example of Cognitive Services on Spark in action, take a look at [this sample notebook](../notebooks/samples/CognitiveServices%20-%20Celebrity%20Quote%20Analysis.ipynb). +To see an example of Cognitive Services on Spark in action, take a look at [this sample notebook](../notebooks/CognitiveServices%20-%20Celebrity%20Quote%20Analysis.ipynb). ## Cognitive Services on Apache Spark™ Currently, the following Cognitive Services are available on Apache Spark™ through MMLSpark: diff --git a/docs/datasets.md b/docs/datasets.md index 8376027f4f..595ae3d409 100644 --- a/docs/datasets.md +++ b/docs/datasets.md @@ -24,7 +24,7 @@ tab-separated file with 2 columns (`rating`, `text`) and 10000 rows. The contains free-form text strings in English language. You can use `mmlspark.TextFeaturizer` to convert the text into feature vectors for machine learning models ([see -example](../notebooks/samples/201%20-%20Amazon%20Book%20Reviews%20-%20TextFeaturizer.ipynb)). +example](../notebooks/201%20-%20Amazon%20Book%20Reviews%20-%20TextFeaturizer.ipynb)). The example dataset is available [here](https://mmlspark.azureedge.net/datasets/BookReviewsFromAmazon10K.tsv); @@ -48,7 +48,7 @@ The example dataset is available the original dataset is available [Krizhevsky's page](https://www.cs.toronto.edu/~kriz/cifar.html). The dataset has been packaged into a gzipped tar archive. See notebook [301 - CIFAR10 CNTK CNN -Evaluation](../notebooks/samples/301%20-%20CIFAR10%20CNTK%20CNN%20Evaluation.ipynb) +Evaluation](../notebooks/301%20-%20CIFAR10%20CNTK%20CNN%20Evaluation.ipynb) for an example how to extract the image data. 
Reference: [_Learning Multiple Layers of Features from Tiny diff --git a/docs/lightgbm.md b/docs/lightgbm.md index fed5bc3413..87d5c366f2 100644 --- a/docs/lightgbm.md +++ b/docs/lightgbm.md @@ -49,7 +49,7 @@ model = LightGBMRegressor(application='quantile', ``` For an end to end application, check out the LightGBM [notebook -example](../notebooks/samples/LightGBM%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb). +example](../notebooks/LightGBM%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb). ### Architecture diff --git a/docs/mmlspark-serving.md b/docs/mmlspark-serving.md index d59e3e0c58..9471644805 100644 --- a/docs/mmlspark-serving.md +++ b/docs/mmlspark-serving.md @@ -25,7 +25,7 @@ ### Jupyter Notebook Examples -- [Deploy a classifier trained on the Adult Census Dataset](../notebooks/samples/SparkServing%20-%20Deploying%20a%20Classifier.ipynb) +- [Deploy a classifier trained on the Adult Census Dataset](../notebooks/SparkServing%20-%20Deploying%20a%20Classifier.ipynb) - More coming soon! ### Spark Serving Hello World diff --git a/docs/vw.md b/docs/vw.md index 6deaeedf08..ddb0b7f692 100644 --- a/docs/vw.md +++ b/docs/vw.md @@ -58,7 +58,7 @@ model = (VowpalWabbitRegressor(args="--holdout_off --loss_function quantile -q : Through the args parameter you can pass command line parameters to VW as documented in the [VW Wiki](https://github.com/vowpalWabbit/vowpal_wabbit/wiki/Command-Line-Arguments). For an end to end application, check out the VowpalWabbit [notebook -example](../notebooks/samples/Vowpal%20Wabbit%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb]). +example](../notebooks/Vowpal%20Wabbit%20-%20Quantile%20Regression%20for%20Drug%20Discovery.ipynb). ### Hyper-parameter tuning diff --git a/environment.yaml b/environment.yaml index 779ac784b2..6e004d8e77 100644 --- a/environment.yaml +++ b/environment.yaml @@ -6,6 +6,7 @@ dependencies: - python=3.6 - pyspark=3.0.1 - requests + - pip - r-base - r-dplyr - r-sparklyr diff --git a/src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py b/lightgbm/src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py similarity index 100% rename from src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py rename to lightgbm/src/main/python/mmlspark/lightgbm/LightGBMClassificationModel.py diff --git a/src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py b/lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py similarity index 100% rename from src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py rename to lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRankerModel.py diff --git a/src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py b/lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py similarity index 100% rename from src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py rename to lightgbm/src/main/python/mmlspark/lightgbm/LightGBMRegressionModel.py diff --git a/src/test/python/mmlsparktest/cyber/__init__.py b/lightgbm/src/main/python/mmlspark/lightgbm/__init__.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/__init__.py rename to lightgbm/src/main/python/mmlspark/lightgbm/__init__.py diff --git a/src/main/python/mmlspark/lightgbm/mixin.py b/lightgbm/src/main/python/mmlspark/lightgbm/mixin.py similarity index 100% rename from src/main/python/mmlspark/lightgbm/mixin.py rename to lightgbm/src/main/python/mmlspark/lightgbm/mixin.py diff --git a/src/main/scala/com/microsoft/lightgbm/SWIG.scala
b/lightgbm/src/main/scala/com/microsoft/lightgbm/SWIG.scala similarity index 100% rename from src/main/scala/com/microsoft/lightgbm/SWIG.scala rename to lightgbm/src/main/scala/com/microsoft/lightgbm/SWIG.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMBase.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.txt diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMConstants.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMDelegate.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMModelMethods.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRanker.txt diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt 
b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMRegressor.txt diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala index 6fc82765b8..ccecaae33e 100644 --- a/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala +++ b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala @@ -8,7 +8,7 @@ import java.net._ import com.microsoft.ml.lightgbm._ import com.microsoft.ml.spark.core.env.StreamUtilities._ -import com.microsoft.ml.spark.downloader.FaultToleranceUtils +import com.microsoft.ml.spark.core.utils.FaultToleranceUtils import com.microsoft.ml.spark.lightgbm.booster.LightGBMBooster import com.microsoft.ml.spark.lightgbm.dataset.LightGBMDataset import com.microsoft.ml.spark.lightgbm.params.{ClassifierTrainParams, TrainParams} diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/booster/LightGBMBooster.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/dataset/LightGBMDataset.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjParam.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/FObjTrait.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMBoosterParam.scala diff --git 
a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/LightGBMParams.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/params/TrainParams.scala diff --git a/src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala b/lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala rename to lightgbm/src/main/scala/com/microsoft/ml/spark/lightgbm/swig/SwigUtils.scala diff --git a/src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv b/lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv similarity index 100% rename from src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv rename to lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMClassifier.csv diff --git a/src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv b/lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv similarity index 100% rename from src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv rename to lightgbm/src/test/resources/benchmarks/benchmarks_VerifyLightGBMRegressor.csv diff --git a/src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala b/lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala rename to lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split1/VerifyLightGBMClassifier.scala diff --git a/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala b/lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala rename to lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRanker.scala diff --git a/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala b/lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala rename to lightgbm/src/test/scala/com/microsoft/ml/spark/lightgbm/split2/VerifyLightGBMRegressor.scala diff --git a/notebooks/samples/AzureSearchIndex - Met Artworks.ipynb b/notebooks/AzureSearchIndex - Met Artworks.ipynb similarity index 100% rename from notebooks/samples/AzureSearchIndex - Met Artworks.ipynb rename to notebooks/AzureSearchIndex - Met Artworks.ipynb diff --git a/notebooks/samples/Classification - Adult Census with Vowpal Wabbit.ipynb b/notebooks/Classification - Adult Census with Vowpal Wabbit.ipynb similarity index 98% rename from notebooks/samples/Classification - Adult Census with Vowpal Wabbit.ipynb rename 
to notebooks/Classification - Adult Census with Vowpal Wabbit.ipynb index e7098605cc..4608bce764 100644 --- a/notebooks/samples/Classification - Adult Census with Vowpal Wabbit.ipynb +++ b/notebooks/Classification - Adult Census with Vowpal Wabbit.ipynb @@ -8,7 +8,7 @@ "# Classification - Adult Census using Vowpal Wabbit in MMLSpark\n", "\n", "In this example, we predict incomes from the *Adult Census* dataset using Vowpal Wabbit (VW) classifier in MMLSpark.\n", - "First, we read the data and split it into train and test sets as in this [example](https://github.com/Azure/mmlspark/blob/master/notebooks/samples/Classification%20-%20Adult%20Census.ipynb\n", + "First, we read the data and split it into train and test sets as in this [example](https://github.com/Azure/mmlspark/blob/master/notebooks/Classification%20-%20Adult%20Census.ipynb\n", ")." ] }, diff --git a/notebooks/samples/Classification - Adult Census.ipynb b/notebooks/Classification - Adult Census.ipynb similarity index 100% rename from notebooks/samples/Classification - Adult Census.ipynb rename to notebooks/Classification - Adult Census.ipynb diff --git a/notebooks/samples/Classification - Before and After MMLSpark.ipynb b/notebooks/Classification - Before and After MMLSpark.ipynb similarity index 100% rename from notebooks/samples/Classification - Before and After MMLSpark.ipynb rename to notebooks/Classification - Before and After MMLSpark.ipynb diff --git a/notebooks/samples/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb b/notebooks/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb similarity index 100% rename from notebooks/samples/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb rename to notebooks/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb diff --git a/notebooks/samples/Cognitive Services - Overview.ipynb b/notebooks/Cognitive Services - Overview.ipynb similarity index 100% rename from notebooks/samples/Cognitive Services - Overview.ipynb rename to notebooks/Cognitive Services - Overview.ipynb diff --git a/notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb b/notebooks/CognitiveServices - Celebrity Quote Analysis.ipynb similarity index 100% rename from notebooks/samples/CognitiveServices - Celebrity Quote Analysis.ipynb rename to notebooks/CognitiveServices - Celebrity Quote Analysis.ipynb diff --git a/notebooks/samples/ConditionalKNN - Exploring Art Across Cultures.ipynb b/notebooks/ConditionalKNN - Exploring Art Across Cultures.ipynb similarity index 100% rename from notebooks/samples/ConditionalKNN - Exploring Art Across Cultures.ipynb rename to notebooks/ConditionalKNN - Exploring Art Across Cultures.ipynb diff --git a/notebooks/samples/CyberML - Anomalous Access Detection.ipynb b/notebooks/CyberML - Anomalous Access Detection.ipynb similarity index 100% rename from notebooks/samples/CyberML - Anomalous Access Detection.ipynb rename to notebooks/CyberML - Anomalous Access Detection.ipynb diff --git a/notebooks/samples/DeepLearning - BiLSTM Medical Entity Extraction.ipynb b/notebooks/DeepLearning - BiLSTM Medical Entity Extraction.ipynb similarity index 100% rename from notebooks/samples/DeepLearning - BiLSTM Medical Entity Extraction.ipynb rename to notebooks/DeepLearning - BiLSTM Medical Entity Extraction.ipynb diff --git a/notebooks/samples/DeepLearning - CIFAR10 Convolutional Network.ipynb b/notebooks/DeepLearning - CIFAR10 Convolutional Network.ipynb similarity index 100% rename from notebooks/samples/DeepLearning - CIFAR10 Convolutional 
Network.ipynb rename to notebooks/DeepLearning - CIFAR10 Convolutional Network.ipynb diff --git a/notebooks/samples/DeepLearning - Flower Image Classification.ipynb b/notebooks/DeepLearning - Flower Image Classification.ipynb similarity index 100% rename from notebooks/samples/DeepLearning - Flower Image Classification.ipynb rename to notebooks/DeepLearning - Flower Image Classification.ipynb diff --git a/notebooks/samples/DeepLearning - Transfer Learning.ipynb b/notebooks/DeepLearning - Transfer Learning.ipynb similarity index 100% rename from notebooks/samples/DeepLearning - Transfer Learning.ipynb rename to notebooks/DeepLearning - Transfer Learning.ipynb diff --git a/notebooks/samples/HttpOnSpark - Working with Arbitrary Web APIs.ipynb b/notebooks/HttpOnSpark - Working with Arbitrary Web APIs.ipynb similarity index 100% rename from notebooks/samples/HttpOnSpark - Working with Arbitrary Web APIs.ipynb rename to notebooks/HttpOnSpark - Working with Arbitrary Web APIs.ipynb diff --git a/notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb b/notebooks/HyperParameterTuning - Fighting Breast Cancer.ipynb similarity index 100% rename from notebooks/samples/HyperParameterTuning - Fighting Breast Cancer.ipynb rename to notebooks/HyperParameterTuning - Fighting Breast Cancer.ipynb diff --git a/notebooks/samples/LightGBM - Overview.ipynb b/notebooks/LightGBM - Overview.ipynb similarity index 100% rename from notebooks/samples/LightGBM - Overview.ipynb rename to notebooks/LightGBM - Overview.ipynb diff --git a/notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb b/notebooks/ModelInterpretation - Snow Leopard Detection.ipynb similarity index 100% rename from notebooks/samples/ModelInterpretation - Snow Leopard Detection.ipynb rename to notebooks/ModelInterpretation - Snow Leopard Detection.ipynb diff --git a/notebooks/samples/OpenCV - Pipeline Image Transformations.ipynb b/notebooks/OpenCV - Pipeline Image Transformations.ipynb similarity index 100% rename from notebooks/samples/OpenCV - Pipeline Image Transformations.ipynb rename to notebooks/OpenCV - Pipeline Image Transformations.ipynb diff --git a/notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb b/notebooks/Regression - Flight Delays with DataCleaning.ipynb similarity index 100% rename from notebooks/samples/Regression - Flight Delays with DataCleaning.ipynb rename to notebooks/Regression - Flight Delays with DataCleaning.ipynb diff --git a/notebooks/samples/Regression - Auto Imports.ipynb b/notebooks/Regression - Auto Imports.ipynb similarity index 100% rename from notebooks/samples/Regression - Auto Imports.ipynb rename to notebooks/Regression - Auto Imports.ipynb diff --git a/notebooks/samples/Regression - Flight Delays.ipynb b/notebooks/Regression - Flight Delays.ipynb similarity index 100% rename from notebooks/samples/Regression - Flight Delays.ipynb rename to notebooks/Regression - Flight Delays.ipynb diff --git a/notebooks/samples/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb b/notebooks/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb similarity index 100% rename from notebooks/samples/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb rename to notebooks/Regression - Vowpal Wabbit vs. LightGBM vs. 
Linear Regressor.ipynb diff --git a/notebooks/samples/SparkServing - Deploying a Classifier.ipynb b/notebooks/SparkServing - Deploying a Classifier.ipynb similarity index 100% rename from notebooks/samples/SparkServing - Deploying a Classifier.ipynb rename to notebooks/SparkServing - Deploying a Classifier.ipynb diff --git a/notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb b/notebooks/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb similarity index 100% rename from notebooks/samples/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb rename to notebooks/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb diff --git a/notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb b/notebooks/TextAnalytics - Amazon Book Reviews.ipynb similarity index 100% rename from notebooks/samples/TextAnalytics - Amazon Book Reviews.ipynb rename to notebooks/TextAnalytics - Amazon Book Reviews.ipynb diff --git a/notebooks/samples/Vowpal Wabbit - Overview.ipynb b/notebooks/Vowpal Wabbit - Overview.ipynb similarity index 100% rename from notebooks/samples/Vowpal Wabbit - Overview.ipynb rename to notebooks/Vowpal Wabbit - Overview.ipynb diff --git a/src/main/python/mmlspark/opencv/ImageTransformer.py b/opencv/src/main/python/mmlspark/opencv/ImageTransformer.py similarity index 100% rename from src/main/python/mmlspark/opencv/ImageTransformer.py rename to opencv/src/main/python/mmlspark/opencv/ImageTransformer.py diff --git a/src/test/python/mmlsparktest/cyber/anamoly/__init__.py b/opencv/src/main/python/mmlspark/opencv/__init__.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/anamoly/__init__.py rename to opencv/src/main/python/mmlspark/opencv/__init__.py diff --git a/src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala b/opencv/src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala rename to opencv/src/main/scala/com/microsoft/ml/spark/image/ImageSetAugmenter.scala diff --git a/src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala b/opencv/src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala rename to opencv/src/main/scala/com/microsoft/ml/spark/opencv/ImageTransformer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala b/opencv/src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala rename to opencv/src/main/scala/com/microsoft/ml/spark/opencv/OpenCVUtils.scala diff --git a/src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala b/opencv/src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala similarity index 100% rename from src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala rename to opencv/src/test/scala/com/microsoft/ml/spark/image/ImageSetAugmenterSuite.scala diff --git a/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala b/opencv/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala similarity index 97% rename from src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala rename to opencv/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala index 5d05a243cc..b20b309bb0 100644 --- 
a/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala +++ b/opencv/src/test/scala/com/microsoft/ml/spark/image/ResizeImageTransformerSuite.scala @@ -8,15 +8,15 @@ import java.net.URL import com.microsoft.ml.spark.core.env.FileUtilities import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} -import com.microsoft.ml.spark.opencv.{ImageTestUtils, ImageTransformer} +import com.microsoft.ml.spark.io.IOImplicits._ +import com.microsoft.ml.spark.opencv.{ImageTransformer, OpenCVTestUtils} +import org.apache.commons.io.FileUtils import org.apache.spark.ml.linalg.DenseVector import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.{DataFrame, Row} -import com.microsoft.ml.spark.io.IOImplicits._ -import org.apache.commons.io.FileUtils class ResizeImageTransformerSuite extends TransformerFuzzing[ResizeImageTransformer] - with ImageTestUtils { + with OpenCVTestUtils { lazy val images: DataFrame = spark.read.image .option("dropInvalid", true).load(FileUtilities.join(fileLocation, "**").toString) diff --git a/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala b/opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala similarity index 98% rename from src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala rename to opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala index 6c7ab6dfe5..62a43aa5e9 100644 --- a/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala +++ b/opencv/src/test/scala/com/microsoft/ml/spark/opencv/ImageTransformerSuite.scala @@ -23,7 +23,7 @@ import org.opencv.imgproc.Imgproc import org.scalactic.Equality import org.scalatest.Assertion -trait ImageTestUtils { +trait OpenCVTestUtils { lazy protected val fileLocation = FileUtilities.join(BuildInfo.datasetDir, "Images", "Grocery") protected def selectTestImageBytes(images: DataFrame): Array[Byte] = { @@ -81,7 +81,7 @@ trait ImageTestUtils { } -class UnrollImageSuite extends TransformerFuzzing[UnrollImage] with ImageTestUtils with DataFrameEquality { +class UnrollImageSuite extends TransformerFuzzing[UnrollImage] with OpenCVTestUtils with DataFrameEquality { lazy val filesRoot = BuildInfo.datasetDir lazy val imagePath = FileUtilities.join(filesRoot,"Images", "CIFAR").toString @@ -128,7 +128,7 @@ class UnrollImageSuite extends TransformerFuzzing[UnrollImage] with ImageTestUti } class UnrollBinaryImageSuite extends TransformerFuzzing[UnrollBinaryImage] - with ImageTestUtils with DataFrameEquality { + with OpenCVTestUtils with DataFrameEquality { lazy val filesRoot = BuildInfo.datasetDir lazy val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString @@ -163,7 +163,7 @@ class UnrollBinaryImageSuite extends TransformerFuzzing[UnrollBinaryImage] override def reader: UnrollBinaryImage.type = UnrollBinaryImage } -class ImageTransformerSuite extends TransformerFuzzing[ImageTransformer] with ImageTestUtils { +class ImageTransformerSuite extends TransformerFuzzing[ImageTransformer] with OpenCVTestUtils { //TODO this is needed to stop the build from freezing override def assertDFEq(df1: DataFrame, df2: DataFrame)(implicit eq: Equality[DataFrame]): Unit = { diff --git a/pipeline.yaml b/pipeline.yaml index 658419f164..bf8e8fd377 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -33,7 +33,6 @@ jobs: pool: vmImage: ubuntu-18.04 steps: - - template: templates/ivy_cache.yml - task: AzureCLI@1 displayName: 'Style Check' inputs: @@ -128,7 +127,6 @@ jobs: pool: vmImage: 
ubuntu-18.04 steps: - - template: templates/ivy_cache.yml - task: AzureCLI@1 displayName: 'Get Docker Tag + Version' inputs: diff --git a/project/BlobMavenPlugin.scala b/project/BlobMavenPlugin.scala new file mode 100644 index 0000000000..de8114172e --- /dev/null +++ b/project/BlobMavenPlugin.scala @@ -0,0 +1,48 @@ +import java.io.File + +import BlobMavenPlugin.autoImport.publishBlob +import BuildUtils.{join, uploadToBlob} +import sbt._ +import Keys._ +import org.apache.ivy.core.IvyPatternHelper + +//noinspection ScalaStyle +object BlobMavenPlugin extends AutoPlugin { + override def trigger = allRequirements + + object autoImport { + val publishBlob = TaskKey[Unit]("publishBlob", "publish the library to mmlspark blob") + val blobArtifactInfo = SettingKey[String]("blobArtifactInfo") + } + + import autoImport._ + + override def requires: Plugins = sbt.Plugins.empty + + override lazy val projectSettings: Seq[Setting[_]] = Seq( + publishBlob := { + publishM2.value + //TODO make this more general - 1.0 is a hack and not sure of a way to get this with sbt keys + val sourceArtifactName = s"${moduleName.value}_${scalaBinaryVersion.value}_1.0" + val destArtifactName = s"${moduleName.value}" + val repositoryDir = new File(new URI(Resolver.mavenLocal.root)) + val orgDirs = organization.value.split(".".toCharArray.head) + val localPackageFolder = join(repositoryDir, orgDirs ++ Seq(sourceArtifactName, version.value):_*).toString + val blobMavenFolder = (orgDirs ++ Seq(destArtifactName, version.value)).mkString("/") + uploadToBlob(localPackageFolder, blobMavenFolder, "maven") + println(blobArtifactInfo.value) + }, + blobArtifactInfo := { + s""" + |MMLSpark Build and Release Information + |--------------- + | + |### Maven Coordinates + | `${organization.value}:${moduleName.value}:${version.value}` + | + |### Maven Resolver + | `https://mmlspark.azureedge.net/maven` + |""".stripMargin + } + ) +} \ No newline at end of file diff --git a/project/CodegenPlugin.scala b/project/CodegenPlugin.scala new file mode 100644 index 0000000000..59bd294aca --- /dev/null +++ b/project/CodegenPlugin.scala @@ -0,0 +1,211 @@ +import java.io.File + +import BuildUtils.{join, runCmd, singleUploadToBlob, zipFolder} +import CondaPlugin.autoImport.{activateCondaEnv, condaEnvLocation, createCondaEnvTask} +import org.apache.commons.io.FileUtils +import sbt.Keys._ +import sbt._ +import spray.json._ + +object CodegenConfigProtocol extends DefaultJsonProtocol { + implicit val CCFormat: RootJsonFormat[CodegenConfig] = jsonFormat8(CodegenConfig.apply) +} + +import CodegenConfigProtocol._ + +case class CodegenConfig(name: String, + jarName: Option[String], + topDir: String, + targetDir: String, + version: String, + pythonizedVersion: String, + rVersion: String, + packageName: String) + +//noinspection ScalaStyle +object CodegenPlugin extends AutoPlugin { + override def trigger = allRequirements + + override def requires: Plugins = CondaPlugin + + def rCmd(activateCondaEnv: Seq[String], cmd: Seq[String], wd: File, libPath: String): Unit = { + runCmd(activateCondaEnv ++ cmd, wd, Map("R_LIBS" -> libPath, "R_USER_LIBS" -> libPath)) + } + + object autoImport { + val pythonizedVersion = settingKey[String]("Pythonized version") + val rVersion = settingKey[String]("R version") + val genPackageNamespace = settingKey[String]("genPackageNamespace") + val genTestPackageNamespace = settingKey[String]("genTestPackageNamespace") + val genJarName = settingKey[Option[String]]("genJarName") + + val targetDir = settingKey[File]("targetDir") + val 
codegenDir = settingKey[File]("codegenDir") + + val codegen = TaskKey[Unit]("codegen", "Generate Code") + val testgen = TaskKey[Unit]("testgen", "Generate Tests") + + val packageR = TaskKey[Unit]("packageR", "Generate roxygen docs and zip R package") + val publishR = TaskKey[Unit]("publishR", "publish R package to blob") + val testR = TaskKey[Unit]("testR", "Run testthat on R tests") + + val packagePython = TaskKey[Unit]("packagePython", "Package python sdk") + val installPipPackage = TaskKey[Unit]("installPipPackage", "install python sdk") + val publishPython = TaskKey[Unit]("publishPython", "publish python wheel") + val testPython = TaskKey[Unit]("testPython", "test python sdk") + + val mergePyCodeDir = SettingKey[File]("mergePyCodeDir") + val mergePyCode = TaskKey[Unit]("mergePyCode", "copy python code to a destination") + + val codegenArgs = settingKey[String]("codegenArgs") + } + + import autoImport._ + + override lazy val projectSettings: Seq[Setting[_]] = Seq( + publishMavenStyle := true, + codegenArgs := { + CodegenConfig( + name.value, + genJarName.value, + baseDirectory.value.getAbsolutePath, + targetDir.value.getAbsolutePath, + version.value, + pythonizedVersion.value, + rVersion.value, + genPackageNamespace.value + ).toJson.compactPrint + }, + genJarName := { + Some(artifactName.value( + ScalaVersion(scalaVersion.value, scalaBinaryVersion.value), + projectID.value, + artifact.value)) + }, + codegen := (Def.taskDyn { + (Compile / compile).value + (Test / compile).value + val arg = codegenArgs.value + Def.task { + (Compile / runMain).toTask(s" com.microsoft.ml.spark.codegen.CodeGen $arg").value + } + }.value), + testgen := (Def.taskDyn { + (Compile / compile).value + (Test / compile).value + val arg = codegenArgs.value + Def.task { + (Test / runMain).toTask(s" com.microsoft.ml.spark.codegen.TestGen $arg").value + } + }.value), + pythonizedVersion := { + if (version.value.contains("-")) { + version.value.split("-".head).head + ".dev1" + } else { + version.value + } + }, + rVersion := { + if (version.value.contains("-")) { + version.value.split("-".head).head + } else { + version.value + } + }, + packageR := { + createCondaEnvTask.value + codegen.value + val rSrcDir = join(codegenDir.value, "src", "R", genPackageNamespace.value) + val rPackageDir = join(codegenDir.value, "package", "R") + val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString + rCmd(activateCondaEnv.value, Seq("R", "-q", "-e", "roxygen2::roxygenise()"), rSrcDir, libPath) + rPackageDir.mkdirs() + zipFolder(rSrcDir, new File(rPackageDir, s"${name.value}-${version.value}.zip")) + }, + testR := { + packageR.value + publishLocal.value + val libPath = join(condaEnvLocation.value, "Lib", "R", "library").toString + val rSrcDir = join(codegenDir.value, "src", "R", genPackageNamespace.value) + rCmd(activateCondaEnv.value, + Seq("R", "CMD", "INSTALL", "--no-multiarch", "--with-keep.source", genPackageNamespace.value), + rSrcDir.getParentFile, libPath) + val testRunner = join("tools", "tests", "run_r_tests.R").getAbsolutePath + rCmd(activateCondaEnv.value, + Seq("Rscript", testRunner), rSrcDir, libPath) + }, + publishR := { + codegen.value + packageR.value + val rPackageDir = join(codegenDir.value, "package", "R") + val rPackage = rPackageDir.listFiles().head + singleUploadToBlob(rPackage.toString, rPackage.getName, "rrr") + }, + packagePython := { + codegen.value + createCondaEnvTask.value + val destPyDir = join(targetDir.value, "classes", genPackageNamespace.value) + val packageDir = 
join(codegenDir.value, "package", "python").absolutePath + val pythonSrcDir = join(codegenDir.value, "src", "python") + if (destPyDir.exists()) FileUtils.forceDelete(destPyDir) + val sourcePyDir = join(pythonSrcDir.getAbsolutePath, genPackageNamespace.value) + FileUtils.copyDirectory(sourcePyDir, destPyDir) + runCmd( + activateCondaEnv.value ++ + Seq(s"python", "setup.py", "bdist_wheel", "--universal", "-d", packageDir), + pythonSrcDir) + }, + installPipPackage := { + packagePython.value + publishLocal.value + runCmd( + activateCondaEnv.value ++ Seq("pip", "install", "-I", + s"${name.value.replace("-", "_")}-${pythonizedVersion.value}-py2.py3-none-any.whl"), + join(codegenDir.value, "package", "python")) + }, + publishPython := { + publishLocal.value + packagePython.value + val fn = s"${name.value.replace("-", "_")}-${pythonizedVersion.value}-py2.py3-none-any.whl" + singleUploadToBlob( + join(codegenDir.value, "package", "python", fn).toString, + version.value + "/" + fn, "pip") + }, + mergePyCode := { + val srcDir = join(codegenDir.value, "src", "python", genPackageNamespace.value) + val destDir = join(mergePyCodeDir.value, "src", "python", genPackageNamespace.value) + FileUtils.copyDirectory(srcDir, destDir) + }, + testPython := { + installPipPackage.value + testgen.value + runCmd( + activateCondaEnv.value ++ Seq("python", + "-m", + "pytest", + s"--cov=${genPackageNamespace.value}", + "--junitxml=../../../../python-test-results.xml", + "--cov-report=xml", + genTestPackageNamespace.value + ), + new File(codegenDir.value, "test/python/") + ) + }, + targetDir := { + artifactPath.in(packageBin).in(Compile).value.getParentFile + }, + mergePyCodeDir := { + join(baseDirectory.value.getParent, "target", "scala-2.12", "sbt-1.0", "generated") + }, + codegenDir := { + join(targetDir.value, "generated") + }, + genPackageNamespace := { + "mmlspark" + }, + genTestPackageNamespace := { + "mmlspark-test" + } + + ) +} \ No newline at end of file diff --git a/project/CondaPlugin.scala b/project/CondaPlugin.scala new file mode 100644 index 0000000000..4e3e3ce005 --- /dev/null +++ b/project/CondaPlugin.scala @@ -0,0 +1,56 @@ +import BuildUtils.{osPrefix, runCmd} +import sbt._ +import Keys._ + +import scala.sys.process.Process + +//noinspection ScalaStyle +object CondaPlugin extends AutoPlugin { + override def trigger = allRequirements + + object autoImport { + val condaEnvName = settingKey[String]("Name of conda environment") + val cleanCondaEnvTask = TaskKey[Unit]("cleanCondaEnv", "create conda env") + val condaEnvLocation = TaskKey[File]("condaEnvLocation", "get install location of conda env") + val createCondaEnvTask = TaskKey[Unit]("createCondaEnv", "create conda env") + val activateCondaEnv = settingKey[Seq[String]]("commands to activate conda environment") + } + + import autoImport._ + override lazy val globalSettings: Seq[Setting[_]] = Seq( + condaEnvName := "mmlspark", + cleanCondaEnvTask := { + runCmd(Seq("conda", "env", "remove", "--name", condaEnvName.value, "-y")) + }, + condaEnvLocation := { + createCondaEnvTask.value + new File(Process("conda env list").lineStream.toList + .map(_.split("\\s+")) + .map(l => (l.head, l.reverse.head)) + .filter(p => p._1 == condaEnvName.value) + .head._2) + }, + createCondaEnvTask := { + val hasEnv = Process("conda env list").lineStream.toList + .map(_.split("\\s+").head).contains(condaEnvName.value) + if (!hasEnv) { + runCmd(Seq("conda", "env", "create", "-f", "environment.yaml")) + } else { + println("Found conda env " + condaEnvName.value) + } + }, + 
activateCondaEnv := { + if (sys.props("os.name").toLowerCase.contains("windows")) { + osPrefix ++ Seq("activate", condaEnvName.value, "&&") + } else { + Seq() + //TODO figure out why this doesn't work + //Seq("/bin/bash", "-l", "-c", "source activate " + condaEnvName, "&&") + } + } + ) + + override def requires: Plugins = sbt.Plugins.empty + + override lazy val projectSettings: Seq[Setting[_]] = Seq() +} \ No newline at end of file diff --git a/project/build.scala b/project/build.scala index f7816cd5d4..06a930e33d 100644 --- a/project/build.scala +++ b/project/build.scala @@ -2,8 +2,12 @@ import java.io.File import java.lang.ProcessBuilder.Redirect object BuildUtils { + def join(root: File, folders: String*): File = { + folders.foldLeft(root) { case (f, s) => new File(f, s) } + } + def join(folders: String*): File = { - folders.tail.foldLeft(new File(folders.head)) { case (f, s) => new File(f, s) } + join(new File(folders.head), folders.tail: _*) } def isWindows: Boolean = { @@ -27,7 +31,7 @@ object BuildUtils { .redirectError(Redirect.INHERIT) .redirectOutput(Redirect.INHERIT) val env = pb.environment() - envVars.foreach(p =>env.put(p._1,p._2)) + envVars.foreach(p => env.put(p._1, p._2)) assert(pb.start().waitFor() == 0) } @@ -56,6 +60,7 @@ object BuildUtils { "--account-key", Secrets.storageKey) runCmd(osPrefix ++ command) } + def singleUploadToBlob(source: String, dest: String, container: String, @@ -76,6 +81,7 @@ object BuildUtils { val (dirs, files) = dir.listFiles.sorted.partition(_.isDirectory) (if (pred == null) files else files.filter(pred)) ++ dirs.flatMap(loop) } + loop(dir) } @@ -91,7 +97,9 @@ object BuildUtils { zip.putNextEntry(new ZipEntry(file.toString.substring(prefixLen).replace(java.io.File.separator, "/"))) val in = new BufferedInputStream(new FileInputStream(file), bufferSize) var b = 0 - while (b >= 0) { zip.write(data, 0, b); b = in.read(data, 0, bufferSize) } + while (b >= 0) { + zip.write(data, 0, b); b = in.read(data, 0, bufferSize) + } in.close() zip.closeEntry() } diff --git a/project/plugins.sbt b/project/plugins.sbt index cc082cf59b..6f4bd427f2 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -4,4 +4,4 @@ addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.9.0") addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.8") addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1") addSbtPlugin("com.dwijnand" % "sbt-dynver" % "4.0.0") -addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.0") \ No newline at end of file +addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.0") diff --git a/src/main/python/setup.py b/src/main/python/setup.py deleted file mode 100644 index 3ba8474be2..0000000000 --- a/src/main/python/setup.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (C) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See LICENSE in project root for information.
- -import os -from setuptools import setup, find_packages -import codecs -import os.path - - -def read(rel_path): - here = os.path.abspath(os.path.dirname(__file__)) - with codecs.open(os.path.join(here, rel_path), "r") as fp: - return fp.read() - - -def get_version(rel_path): - for line in read(rel_path).splitlines(): - if line.startswith("__version__"): - delim = '"' if '"' in line else "'" - return line.split(delim)[1] - return "0.0.0" - - -setup( - name="mmlspark", - version=get_version("mmlspark/__init__.py"), - description="Microsoft ML for Spark", - long_description="Microsoft ML for Apache Spark contains Microsoft's open source " - + "contributions to the Apache Spark ecosystem", - license="MIT", - packages=find_packages(), - url="https://github.com/Azure/mmlspark", - author="Microsoft", - author_email="mmlspark-support@microsoft.com", - classifiers=[ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "Intended Audience :: Data Scientists", - "Topic :: Software Development :: Datascience Tools", - "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 3", - ], - zip_safe=True, - package_data={"mmlspark": ["../LICENSE.txt", "../README.txt"]}, -) diff --git a/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala b/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala deleted file mode 100644 index 03785cbd8c..0000000000 --- a/src/main/scala/com/microsoft/ml/spark/codegen/CodegenConfig.scala +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.codegen - -import java.io.File - -import com.microsoft.ml.spark.build.BuildInfo - -object Config { - val DebugMode = sys.env.getOrElse("DEBUGMODE", "").trim.toLowerCase == "true" - - val TopDir = BuildInfo.baseDirectory - val Version = BuildInfo.version - val PackageName = BuildInfo.name - val TargetDir = new File(TopDir, s"target/scala-${BuildInfo.scalaVersion.slice(0,4)}") - val ScalaSrcDir = "src/main/scala" - - val GeneratedDir = new File(TargetDir, "generated") - val PackageDir = new File(GeneratedDir, "package") - val SrcDir = new File(GeneratedDir, "src") - val TestDir = new File(GeneratedDir, "test") - val DocDir = new File(GeneratedDir, "doc") - val TestDataDir = new File(GeneratedDir, "test-data") - - //Python Codegen Constant - val PySrcDir = new File(SrcDir, "python") - val PyPackageDir = new File(PackageDir, "python") - val PyTestDir = new File(TestDir, "python") - val PySrcOverrideDir = new File(TopDir, "src/main/python") - val PyTestOverrideDir = new File(TopDir, "src/test/python") - - //R Codegen Constants - val RSrcRoot = new File(SrcDir, "R") - val RSrcDir = new File(RSrcRoot, "mmlspark/R") - val RPackageDir = new File(PackageDir, "R") - val RTestDir = new File(RSrcRoot, "mmlspark/tests") - - val RTestOverrideDir = new File(TopDir, "src/test/R") - val RSrcOverrideDir = new File(TopDir, "src/main/R") - - //val rPackageFile = new File(rPackageDir, s"mmlspark-$mmlVer.zip") - - val InternalPrefix = "_" - val ScopeDepth = " " * 4 - - val CopyrightLines = - s"""|# Copyright (C) Microsoft Corporation. All rights reserved. - |# Licensed under the MIT License. See LICENSE in project root for information. 
- |""".stripMargin - - // The __init__.py file - def packageHelp(importString: String): String = { - s"""|$CopyrightLines - | - |"\"" - |MicrosoftML is a library of Python classes to interface with the - |Microsoft scala APIs to utilize Apache Spark to create distibuted - |machine learning models. - | - |MicrosoftML simplifies training and scoring classifiers and - |regressors, as well as facilitating the creation of models using the - |CNTK library, images, and text. - |"\"" - | - |__version__ = "${BuildInfo.pythonizedVersion}" - |__spark_package_version__ = "${BuildInfo.version}" - | - |$importString - |""".stripMargin - } -} diff --git a/src/test/python/mmlsparktest/nn/__init__.py b/src/test/python/mmlsparktest/nn/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/test/python/mmlsparktest/recommendation/__init__.py b/src/test/python/mmlsparktest/recommendation/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/test/python/mmlsparktest/vw/__init__.py b/src/test/python/mmlsparktest/vw/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKTestUtils.scala b/src/test/scala/com/microsoft/ml/spark/cntk/CNTKTestUtils.scala deleted file mode 100644 index 4981013301..0000000000 --- a/src/test/scala/com/microsoft/ml/spark/cntk/CNTKTestUtils.scala +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.cntk - -import java.io.File - -import com.microsoft.ml.spark.build.BuildInfo -import com.microsoft.ml.spark.core.env.FileUtilities -import com.microsoft.ml.spark.core.test.base.TestBase -import com.microsoft.ml.spark.image.UnrollImage -import org.apache.spark.ml.linalg.DenseVector -import org.apache.spark.sql._ -import com.microsoft.ml.spark.io.IOImplicits._ - -trait CNTKTestUtils extends TestBase { - - val filesRoot = BuildInfo.datasetDir.toString - val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString - val modelPath = FileUtilities.join(filesRoot, "CNTKModel", "ConvNet_CIFAR10.model").toString - val inputCol = "cntk_images" - val outputCol = "out" - val labelCol = "labels" - - val featureVectorLength = 3 * 32 * 32 - lazy val saveFile = new File(tmpDir.toFile, "spark-z.model").toString - - def testModelDF(spark: SparkSession): DataFrame = { - import spark.implicits._ - spark.sparkContext.parallelize(Seq( - Array(1.32165250, -2.1215112, 0.63150704, 0.77315974, -1.28163720, - -0.20210080, -2.2839167, -2.08691480, 5.08418200, -1.33741090), - Array(3.44079640, 1.4877119, -0.74059330, -0.34381202, -2.48724990, - -2.62866950, -3.1693816, -3.14182600, 4.76314800, 0.68712880), - Array(-1.88747900, -4.7685330, 0.15169683, 6.80547570, -0.38405967, - 3.41065170, 1.3302778, -0.87714905, -2.18046050, -4.16661830), - Array(5.01010300, 3.9860306, -1.36795600, -0.89830830, -4.49545430, - -4.19537070, -4.4045380, -5.81759450, 6.93805700, 1.49001510), - Array(-4.70754600, -6.0414960, 1.20658250, 5.40738300, 1.07661690, - 4.71566440, 4.3834330, -1.57187440, -2.96569730, -5.43208270), - Array(-1.23873880, -3.2042341, 2.54533000, 5.51954800, 2.89042470, - 0.12380804, 3.8639085, -4.79466800, -2.41463420, -5.17418430))).toDF - } - - def testImages(spark: SparkSession): DataFrame = { - val images = spark.read.image.load(imagePath) - - val unroll = new UnrollImage().setInputCol("image").setOutputCol(inputCol) - - 
unroll.transform(images).select(inputCol) - } - - def makeFakeData(spark: SparkSession, rows: Int, size: Int, outputDouble: Boolean = false): DataFrame = { - import spark.implicits._ - if (outputDouble) { - List - .fill(rows)(List.fill(size)(0.0).toArray) - .zip(List.fill(rows)(0.0)) - .toDF(inputCol, labelCol) - } else { - List - .fill(rows)(List.fill(size)(0.0.toFloat).toArray) - .zip(List.fill(rows)(0.0)) - .toDF(inputCol, labelCol) - } - } - - protected def compareToTestModel(result: DataFrame) = { - //TODO improve checks - assert(result.columns.toSet == Set(inputCol, outputCol)) - assert(result.count() == testModelDF(result.sparkSession).count()) - val max = result - .select(outputCol) - .collect() - .map(row => row.getAs[DenseVector](0).toArray.max) - .max - assert(max < 10 & max > -10) - } - -} diff --git a/src/test/scala/com/microsoft/ml/spark/codegen/CodeGen.scala b/src/test/scala/com/microsoft/ml/spark/codegen/CodeGen.scala deleted file mode 100644 index 67d667e339..0000000000 --- a/src/test/scala/com/microsoft/ml/spark/codegen/CodeGen.scala +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.codegen - -import java.io.File -import com.microsoft.ml.spark.build.BuildInfo -import com.microsoft.ml.spark.codegen.Config._ -import com.microsoft.ml.spark.core.env.FileUtilities._ -import com.microsoft.ml.spark.core.test.base.TestBase -import com.microsoft.ml.spark.core.test.fuzzing.PyTestFuzzing -import com.microsoft.ml.spark.core.utils.JarLoadingUtils.instantiateServices -import org.apache.commons.io.FileUtils -import org.apache.commons.io.FilenameUtils._ - -object CodeGenUtils { - def clean(dir: File): Unit = if (dir.exists()) FileUtils.forceDelete(dir) - - def toDir(f: File): File = new File(f, File.separator) -} - -object CodeGen { - - import CodeGenUtils._ - - def generatePythonClasses(): Unit = { - instantiateServices[PythonWrappable].foreach { w => - w.makePyFile() - } - } - - def generateRClasses(): Unit = { - instantiateServices[RWrappable].foreach { w => - w.makeRFile() - } - } - - private def makeInitFiles(packageFolder: String = ""): Unit = { - val dir = new File(new File(PySrcDir, "mmlspark"), packageFolder) - val packageString = if (packageFolder != "") packageFolder.replace("/", ".") else "" - val importStrings = - dir.listFiles.filter(_.isFile).sorted - .map(_.getName) - .filter(name => name.endsWith(".py") && !name.startsWith("_") && !name.startsWith("test")) - .map(name => s"from mmlspark$packageString.${getBaseName(name)} import *\n").mkString("") - writeFile(new File(dir, "__init__.py"), packageHelp(importStrings)) - dir.listFiles().filter(_.isDirectory).foreach(f => - makeInitFiles(packageFolder + "/" + f.getName) - ) - } - - //noinspection ScalaStyle - def generateRPackageData(): Unit = { - // description file; need to encode version as decimal - val today = new java.text.SimpleDateFormat("yyyy-MM-dd") - .format(new java.util.Date()) - - RSrcDir.mkdirs() - writeFile(new File(RSrcDir.getParentFile, "DESCRIPTION"), - s"""|Package: mmlspark - |Title: Access to MMLSpark via R - |Description: Provides an interface to MMLSpark. 
- |Version: ${BuildInfo.rVersion} - |Date: $today - |Author: Microsoft Corporation - |Maintainer: MMLSpark Team - |URL: https://github.com/Azure/mmlspark - |BugReports: https://github.com/Azure/mmlspark/issues - |Depends: - | R (>= 2.12.0) - |Imports: - | sparklyr - |License: MIT - |Suggests: - | testthat (>= 3.0.0) - |Config/testthat/edition: 3 - |""".stripMargin) - - writeFile(new File(RSrcDir, "package_register.R"), - s"""|#' @import sparklyr - |spark_dependencies <- function(spark_version, scala_version, ...) { - | spark_dependency( - | jars = c(), - | packages = c( - | sprintf("com.microsoft.ml.spark:mmlspark_%s:${BuildInfo.version}", scala_version) - | ), - | repositories = c("https://mmlspark.azureedge.net/maven") - | ) - |} - | - |#' @import sparklyr - |.onLoad <- function(libname, pkgname) { - | sparklyr::register_extension(pkgname) - |} - |""".stripMargin) - - writeFile(new File(RSrcDir.getParentFile, "mmlspark.Rproj"), - """ - |Version: 1.0 - | - |RestoreWorkspace: Default - |SaveWorkspace: Default - |AlwaysSaveHistory: Default - | - |EnableCodeIndexing: Yes - |UseSpacesForTab: Yes - |NumSpacesForTab: 4 - |Encoding: UTF-8 - | - |RnwWeave: Sweave - |LaTeX: pdfLaTeX - | - |BuildType: Package - |PackageUseDevtools: Yes - |PackageInstallArgs: --no-multiarch --with-keep.source - | - |""".stripMargin) - - } - - def rGen(): Unit = { - clean(RSrcRoot) - generateRPackageData() - generateRClasses() - FileUtils.copyDirectoryToDirectory(toDir(RSrcOverrideDir), toDir(RSrcDir)) - FileUtils.copyDirectoryToDirectory(toDir(RTestOverrideDir), toDir(RTestDir)) - } - - def pyGen(): Unit = { - clean(PySrcDir) - generatePythonClasses() - TestBase.stopSparkSession() - FileUtils.copyDirectoryToDirectory(toDir(PySrcOverrideDir), toDir(PySrcDir)) - makeInitFiles() - } - - def main(args: Array[String]): Unit = { - clean(PackageDir) - rGen() - pyGen() - } - -} - -object TestGen { - - import CodeGenUtils._ - - def generatePythonTests(): Unit = { - instantiateServices[PyTestFuzzing[_]].foreach { ltc => - try { - ltc.makePyTestFile() - } catch { - case _: NotImplementedError => - println(s"ERROR: Could not generate test for ${ltc.testClassName} because of Complex Parameters") - } - } - } - - private def makeInitFiles(packageFolder: String = ""): Unit = { - val dir = new File(new File(PyTestDir, "mmlsparktest"), packageFolder) - writeFile(new File(dir, "__init__.py"), "") - dir.listFiles().filter(_.isDirectory).foreach(f => - makeInitFiles(packageFolder + "/" + f.getName) - ) - } - - def main(args: Array[String]): Unit = { - clean(TestDataDir) - clean(PyTestDir) - generatePythonTests() - TestBase.stopSparkSession() - FileUtils.copyDirectoryToDirectory(toDir(PyTestOverrideDir), toDir(PyTestDir)) - makeInitFiles() - } -} diff --git a/src/main/python/mmlspark/vw/VowpalWabbitClassificationModel.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitClassificationModel.py similarity index 100% rename from src/main/python/mmlspark/vw/VowpalWabbitClassificationModel.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitClassificationModel.py diff --git a/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py similarity index 97% rename from src/main/python/mmlspark/vw/VowpalWabbitClassifier.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py index ba9d72dc1e..ac33082148 100644 --- a/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py +++ b/vw/src/main/python/mmlspark/vw/VowpalWabbitClassifier.py @@ -1,14 +1,14 @@ -# Copyright (C) 
Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See LICENSE in project root for information. - -from mmlspark.vw._VowpalWabbitClassifier import _VowpalWabbitClassifier -from pyspark.ml.common import inherit_doc - -@inherit_doc -class VowpalWabbitClassifier(_VowpalWabbitClassifier): - - def setInitialModel(self, model): - """ - Initialize the estimator with a previously trained model. - """ - self._java_obj.setInitialModel(model._java_obj.getModel()) +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +from mmlspark.vw._VowpalWabbitClassifier import _VowpalWabbitClassifier +from pyspark.ml.common import inherit_doc + +@inherit_doc +class VowpalWabbitClassifier(_VowpalWabbitClassifier): + + def setInitialModel(self, model): + """ + Initialize the estimator with a previously trained model. + """ + self._java_obj.setInitialModel(model._java_obj.getModel()) diff --git a/src/main/python/mmlspark/vw/VowpalWabbitContextualBandit.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitContextualBandit.py similarity index 100% rename from src/main/python/mmlspark/vw/VowpalWabbitContextualBandit.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitContextualBandit.py diff --git a/src/main/python/mmlspark/vw/VowpalWabbitContextualBanditModel.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitContextualBanditModel.py similarity index 100% rename from src/main/python/mmlspark/vw/VowpalWabbitContextualBanditModel.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitContextualBanditModel.py diff --git a/src/main/python/mmlspark/vw/VowpalWabbitRegressionModel.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitRegressionModel.py similarity index 100% rename from src/main/python/mmlspark/vw/VowpalWabbitRegressionModel.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitRegressionModel.py diff --git a/src/main/python/mmlspark/vw/VowpalWabbitRegressor.py b/vw/src/main/python/mmlspark/vw/VowpalWabbitRegressor.py similarity index 100% rename from src/main/python/mmlspark/vw/VowpalWabbitRegressor.py rename to vw/src/main/python/mmlspark/vw/VowpalWabbitRegressor.py diff --git a/src/test/python/mmlsparktest/cyber/feature/__init__.py b/vw/src/main/python/mmlspark/vw/__init__.py similarity index 100% rename from src/test/python/mmlsparktest/cyber/feature/__init__.py rename to vw/src/main/python/mmlspark/vw/__init__.py diff --git a/src/main/scala/com/microsoft/ml/spark/vw/HasNumBits.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/HasNumBits.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/HasNumBits.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/HasNumBits.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/HasSumcollisions.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/HasSumcollisions.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/HasSumcollisions.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/HasSumcollisions.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VectorUtils.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VectorUtils.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VectorUtils.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VectorUtils.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VectorZipper.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VectorZipper.scala similarity index 100% rename from 
src/main/scala/com/microsoft/ml/spark/vw/VectorZipper.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VectorZipper.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala similarity index 99% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala index 401daeadd2..59c983aac1 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBase.scala @@ -9,7 +9,7 @@ import com.microsoft.ml.spark.codegen.Wrappable import com.microsoft.ml.spark.core.contracts.HasWeightCol import com.microsoft.ml.spark.core.env.StreamUtilities import com.microsoft.ml.spark.core.utils.{ClusterUtil, StopWatch} -import com.microsoft.ml.spark.downloader.FaultToleranceUtils +import com.microsoft.ml.spark.core.utils.FaultToleranceUtils import org.apache.spark.TaskContext import org.apache.spark.internal._ import org.apache.spark.ml.param._ diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala similarity index 94% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala index 46b85505e7..d020882991 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitBaseModel.scala @@ -4,15 +4,13 @@ package com.microsoft.ml.spark.vw import com.microsoft.ml.spark.core.env.StreamUtilities -import com.microsoft.ml.spark.downloader.FaultToleranceUtils +import com.microsoft.ml.spark.core.utils.FaultToleranceUtils import org.apache.spark.binary.BinaryFileFormat -import org.apache.spark.ml.ComplexParamsWritable -import org.apache.spark.ml.linalg.{DenseVector, SparseVector} import org.apache.spark.ml.param.{ByteArrayParam, DataFrameParam, Param} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.functions.{col, struct, udf} import org.apache.spark.sql.types.StructType -import org.vowpalwabbit.spark.{VowpalWabbitArguments, VowpalWabbitExample, VowpalWabbitMurmur, VowpalWabbitNative} +import org.vowpalwabbit.spark.{VowpalWabbitArguments, VowpalWabbitExample, VowpalWabbitNative} import org.vowpalwabbit.spark.prediction.ScalarPrediction import scala.io.Source diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitClassifier.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitClassifier.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitClassifier.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitClassifier.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitContextualBandit.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitContextualBandit.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitContextualBandit.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitContextualBandit.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitFeaturizer.scala rename to 
vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitInteractions.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitInteractions.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitInteractions.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitInteractions.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitMurmurWithPrefix.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitMurmurWithPrefix.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitMurmurWithPrefix.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitMurmurWithPrefix.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitRegressor.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitRegressor.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitRegressor.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitRegressor.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitUtil.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitUtil.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitUtil.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/VowpalWabbitUtil.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala similarity index 97% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala index 75dd1d651a..7ae43e536d 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/BooleanFeaturizer.scala @@ -1,52 +1,52 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.vw.featurizer - -import org.apache.spark.sql.Row -import org.vowpalwabbit.spark.VowpalWabbitMurmur - -import scala.collection.mutable - -/** - * Featurize boolean value into native VW structure. (True = hash(feature name):1, False ignored). - * @param fieldIdx input field index. - * @param columnName used as feature name. - * @param namespaceHash pre-hashed namespace. - * @param mask bit mask applied to final hash. - */ -private[ml] class BooleanFeaturizer(override val fieldIdx: Int, - override val columnName: String, - namespaceHash: Int, mask: Int) - extends Featurizer(fieldIdx) with ElementFeaturizer[Boolean] { - - /** - * Pre-hashed feature index. - */ - val featureIdx: Int = mask & VowpalWabbitMurmur.hash(columnName, namespaceHash) - - /** - * Featurize a single row. - * @param row input row. - * @param indices output indices. - * @param values output values. - * @note this interface isn't very Scala idiomatic, but it avoids lots of allocation. - * Also due to SparseVector limitations we don't support 64bit indices (e.g. 
indices are signed 32bit ints) - */ - override def featurize(row: Row, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = { - - featurize(0, row.getBoolean(fieldIdx), indices, values) - } - - def featurize(idx: Int, - value: Boolean, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = { - if (value) { - indices += featureIdx + idx - values += 1.0 - } - } -} +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.vw.featurizer + +import org.apache.spark.sql.Row +import org.vowpalwabbit.spark.VowpalWabbitMurmur + +import scala.collection.mutable + +/** + * Featurize boolean value into native VW structure. (True = hash(feature name):1, False ignored). + * @param fieldIdx input field index. + * @param columnName used as feature name. + * @param namespaceHash pre-hashed namespace. + * @param mask bit mask applied to final hash. + */ +private[ml] class BooleanFeaturizer(override val fieldIdx: Int, + override val columnName: String, + namespaceHash: Int, mask: Int) + extends Featurizer(fieldIdx) with ElementFeaturizer[Boolean] { + + /** + * Pre-hashed feature index. + */ + val featureIdx: Int = mask & VowpalWabbitMurmur.hash(columnName, namespaceHash) + + /** + * Featurize a single row. + * @param row input row. + * @param indices output indices. + * @param values output values. + * @note this interface isn't very Scala idiomatic, but it avoids lots of allocation. + * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints) + */ + override def featurize(row: Row, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = { + + featurize(0, row.getBoolean(fieldIdx), indices, values) + } + + def featurize(idx: Int, + value: Boolean, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = { + if (value) { + indices += featureIdx + idx + values += 1.0 + } + } +} diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/ElementFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/ElementFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/ElementFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/ElementFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala similarity index 97% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala index a8d6bf1353..deceb8ddd7 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/Featurizer.scala @@ -1,29 +1,29 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.vw.featurizer - -import com.microsoft.ml.spark.vw.VowpalWabbitMurmurWithPrefix -import org.apache.spark.sql.Row - -import scala.collection.mutable - -private[ml] abstract class Featurizer(val fieldIdx: Int) extends Serializable { - - val columnName: String - - /** - * Initialize hasher that already pre-hashes the column prefix. 
- */ - protected lazy val hasher: VowpalWabbitMurmurWithPrefix = new VowpalWabbitMurmurWithPrefix(columnName) - - /** - * Featurize a single row. - * @param row input row. - * @param indices output indices. - * @param values output values. - * @note this interface isn't very Scala idiomatic, but it avoids lots of allocation. - * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints) - */ - def featurize(row: Row, indices: mutable.ArrayBuilder[Int], values: mutable.ArrayBuilder[Double]): Unit -} +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.vw.featurizer + +import com.microsoft.ml.spark.vw.VowpalWabbitMurmurWithPrefix +import org.apache.spark.sql.Row + +import scala.collection.mutable + +private[ml] abstract class Featurizer(val fieldIdx: Int) extends Serializable { + + val columnName: String + + /** + * Initialize hasher that already pre-hashes the column prefix. + */ + protected lazy val hasher: VowpalWabbitMurmurWithPrefix = new VowpalWabbitMurmurWithPrefix(columnName) + + /** + * Featurize a single row. + * @param row input row. + * @param indices output indices. + * @param values output values. + * @note this interface isn't very Scala idiomatic, but it avoids lots of allocation. + * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints) + */ + def featurize(row: Row, indices: mutable.ArrayBuilder[Int], values: mutable.ArrayBuilder[Double]): Unit +} diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapStringFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapStringFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapStringFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/MapStringFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala similarity index 97% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala index c7ade02c07..cc56a1081b 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/NumericFeaturizer.scala @@ -1,61 +1,61 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.vw.featurizer - -import org.apache.spark.sql.Row -import org.vowpalwabbit.spark.VowpalWabbitMurmur - -import scala.collection.mutable - -/** - * Featurize numeric values into native VW structure. ((hash(column name):value) - * @param fieldIdx input field index. - * @param columnName used as feature name prefix. - * @param namespaceHash pre-hashed namespace. - * @param mask bit mask applied to final hash. 
- */ -private[ml] class NumericFeaturizer[T: Numeric](override val fieldIdx: Int, - override val columnName: String, - val namespaceHash: Int, - val mask: Int, - val zero: Numeric[T]) - extends Featurizer(fieldIdx) with ElementFeaturizer[T] { - - /** - * Pre-hashed feature index. - */ - val featureIdx: Int = mask & VowpalWabbitMurmur.hash(columnName, namespaceHash) - - override def featurize(row: Row, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = { - featurize(0, row.getAs[T](fieldIdx), indices, values) - } - - def featurize(idx: Int, - value: T, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = { - // Note: 0 valued features are always filtered. - if (value != zero.zero) { - indices += featureIdx + idx - // This is weird but zero is a numeric typeclass that is used to convert the generic T to a double. - values += zero.toDouble(value) - } - () - } -} - -class NullableNumericFeaturizer[T: Numeric](override val fieldIdx: Int, - override val columnName: String, - override val namespaceHash: Int, - override val mask: Int, - override val zero: Numeric[T]) - extends NumericFeaturizer[T](fieldIdx, columnName, namespaceHash, mask, zero) { - override def featurize(row: Row, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = - if (!row.isNullAt(fieldIdx)) - super.featurize(row, indices, values) -} +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.vw.featurizer + +import org.apache.spark.sql.Row +import org.vowpalwabbit.spark.VowpalWabbitMurmur + +import scala.collection.mutable + +/** + * Featurize numeric values into native VW structure. ((hash(column name):value) + * @param fieldIdx input field index. + * @param columnName used as feature name prefix. + * @param namespaceHash pre-hashed namespace. + * @param mask bit mask applied to final hash. + */ +private[ml] class NumericFeaturizer[T: Numeric](override val fieldIdx: Int, + override val columnName: String, + val namespaceHash: Int, + val mask: Int, + val zero: Numeric[T]) + extends Featurizer(fieldIdx) with ElementFeaturizer[T] { + + /** + * Pre-hashed feature index. + */ + val featureIdx: Int = mask & VowpalWabbitMurmur.hash(columnName, namespaceHash) + + override def featurize(row: Row, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = { + featurize(0, row.getAs[T](fieldIdx), indices, values) + } + + def featurize(idx: Int, + value: T, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = { + // Note: 0 valued features are always filtered. + if (value != zero.zero) { + indices += featureIdx + idx + // This is weird but zero is a numeric typeclass that is used to convert the generic T to a double. 
+ values += zero.toDouble(value) + } + () + } +} + +class NullableNumericFeaturizer[T: Numeric](override val fieldIdx: Int, + override val columnName: String, + override val namespaceHash: Int, + override val mask: Int, + override val zero: Numeric[T]) + extends NumericFeaturizer[T](fieldIdx, columnName, namespaceHash, mask, zero) { + override def featurize(row: Row, + indices: mutable.ArrayBuilder[Int], + values: mutable.ArrayBuilder[Double]): Unit = + if (!row.isNullAt(fieldIdx)) + super.featurize(row, indices, values) +} diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/SeqFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/SeqFeaturizer.scala similarity index 100% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/SeqFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/SeqFeaturizer.scala diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala similarity index 97% rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala index 804f6b482f..d582141522 100644 --- a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala +++ b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringFeaturizer.scala @@ -1,47 +1,47 @@ -// Copyright (C) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. See LICENSE in project root for information. - -package com.microsoft.ml.spark.vw.featurizer - -import org.apache.spark.sql.Row - -import scala.collection.mutable - -/** - * Featurize string into native VW structure. (hash(column name + value):1) - * @param fieldIdx input field index. - * @param columnName used as feature name prefix. - * @param namespaceHash pre-hashed namespace. - * @param mask bit mask applied to final hash. - */ -private[ml] class StringFeaturizer(override val fieldIdx: Int, - override val columnName: String, - val namespaceHash: Int, - val mask: Int) - extends Featurizer(fieldIdx) with ElementFeaturizer[String] { - - /** - * Featurize a single row. - * @param row input row. - * @param indices output indices. - * @param values output values. - * @note this interface isn't very Scala-esce, but it avoids lots of allocation. - * Also due to SparseVector limitations we don't support 64bit indices (e.g. indices are signed 32bit ints) - */ - override def featurize(row: Row, indices: mutable.ArrayBuilder[Int], values: mutable.ArrayBuilder[Double]): Unit = { - featurize(0, row.getString(fieldIdx), indices, values) - - () - } - - def featurize(idx: Int, - value: String, - indices: mutable.ArrayBuilder[Int], - values: mutable.ArrayBuilder[Double]): Unit = { - - if (value != null && !value.isEmpty) { - indices += mask & hasher.hash(value, namespaceHash) - values += 1.0 - } - } -} +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.vw.featurizer + +import org.apache.spark.sql.Row + +import scala.collection.mutable + +/** + * Featurize string into native VW structure. (hash(column name + value):1) + * @param fieldIdx input field index. + * @param columnName used as feature name prefix. + * @param namespaceHash pre-hashed namespace. + * @param mask bit mask applied to final hash. 
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringSplitFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringSplitFeaturizer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringSplitFeaturizer.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StringSplitFeaturizer.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StructFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StructFeaturizer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/StructFeaturizer.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/StructFeaturizer.scala
diff --git a/src/main/scala/com/microsoft/ml/spark/vw/featurizer/VectorFeaturizer.scala b/vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/VectorFeaturizer.scala
similarity index 100%
rename from src/main/scala/com/microsoft/ml/spark/vw/featurizer/VectorFeaturizer.scala
rename to vw/src/main/scala/com/microsoft/ml/spark/vw/featurizer/VectorFeaturizer.scala
diff --git a/src/test/python/mmlsparktest/cyber/utils/__init__.py b/vw/src/test/python/mmlsparktest/vw/__init__.py
similarity index 100%
rename from src/test/python/mmlsparktest/cyber/utils/__init__.py
rename to vw/src/test/python/mmlsparktest/vw/__init__.py
diff --git a/src/test/python/mmlsparktest/vw/test_vw.py b/vw/src/test/python/mmlsparktest/vw/test_vw.py
similarity index 100%
rename from src/test/python/mmlsparktest/vw/test_vw.py
rename to vw/src/test/python/mmlsparktest/vw/test_vw.py
diff --git a/src/test/python/mmlsparktest/vw/test_vw_cb.py b/vw/src/test/python/mmlsparktest/vw/test_vw_cb.py
similarity index 100%
rename from src/test/python/mmlsparktest/vw/test_vw_cb.py
rename to vw/src/test/python/mmlsparktest/vw/test_vw_cb.py
diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VWContextualBandidSpec.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VWContextualBandidSpec.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/vw/VWContextualBandidSpec.scala
rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VWContextualBandidSpec.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVectorZipper.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVectorZipper.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVectorZipper.scala
rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVectorZipper.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitClassifier.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitClassifier.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitClassifier.scala
rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitClassifier.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitFeaturizer.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitFeaturizer.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitFeaturizer.scala
rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitFeaturizer.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitInteractions.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitInteractions.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitInteractions.scala
rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitInteractions.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitMurmurWithPrefix.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitMurmurWithPrefix.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitMurmurWithPrefix.scala
rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitMurmurWithPrefix.scala
diff --git a/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitRegressor.scala b/vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitRegressor.scala
similarity index 100%
rename from src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitRegressor.scala
rename to vw/src/test/scala/com/microsoft/ml/spark/vw/VerifyVowpalWabbitRegressor.scala