import scala.util.Random
import scala.util.Random._

/** Draws one timestamped sample from a mixture of two normal random variables.
  *
  * Both components have standard deviation 1; they differ only in their
  * location (mean). A uniform draw from `r` decides which component is used:
  * when it is less than or equal to `normalWeight` the sample is centred on
  * `normalLocation`, otherwise on `abnormalLocation`.
  *
  * @param normalLocation   mean of the "normal" component
  * @param abnormalLocation mean of the "abnormal" component
  * @param normalWeight     threshold compared against a uniform draw; the normal
  *                         component is chosen when the draw is <= this value
  * @param r                generator used for both the uniform and the Gaussian draw
  * @return the wall-clock time formatted as `yyyy-MM-dd HH:mm:ss.SSS`, taken after
  *         a 5 ms pause, paired with the sampled value
  */
def myMixtureOf2Normals(
    normalLocation: Double,
    abnormalLocation: Double,
    normalWeight: Double,
    r: Random
): (String, Double) = {
  // The uniform draw is consumed before the Gaussian draw; this order must be
  // kept so that a seeded generator reproduces the same sequence of samples.
  val location = if (r.nextDouble() <= normalWeight) normalLocation else abnormalLocation
  val sample = r.nextGaussian() + location

  Thread.sleep(5L) // sleep 5 milliseconds

  val formatter = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS")
  val now = formatter.format(new java.util.Date())
  (now, sample)
}
import scala.util.Random
import scala.util.Random._
myMixtureOf2Normals: (normalLocation: Double, abnormalLocation: Double, normalWeight: Double, r: scala.util.Random)(String, Double)
// Seeded generator so the two samples printed below are reproducible.
val r = new Random(1L)

// should always produce samples as (0.5876430182311466,-0.34037937678788865) when seed = 1L
// The two calls are evaluated left to right and printed as a pair of (timestamp, sample) tuples.
println(
  (
    myMixtureOf2Normals(1.0, 10.0, 0.99, r),
    myMixtureOf2Normals(1.0, 10.0, 0.99, r)
  )
)
((2019-05-31 15:18:40.210,0.5876430182311466),(2019-05-31 15:18:40.215,-0.34037937678788865))
r: scala.util.Random = scala.util.Random@76d00bb2
val r = new Random(12345L) // set seed for reproducibility

// NOTE: this top-level `a` is never modified by the loop below, so it stays 0.
var a = 0;

// Write five batches of samples to the distributed file system, one directory
// per batch, pausing 5 seconds between batches.
for (_ <- 1 to 5) {
  // make a DataSet
  val data = sc.parallelize(Vector.fill(100){myMixtureOf2Normals(1.0, 10.0, 0.99, r)}) // 100 samples from mixture
    .coalesce(1) // this is to make sure that we have only one partition per dir
    .toDF.as[(String,Double)]

  // Directory suffix is "<minute>_<second>" of the current time. Both fields are
  // formatted from a single instant so they can never come from different sides
  // of a minute boundary ('_' is not a pattern letter and is emitted verbatim).
  val minuteAndSecond = new java.text.SimpleDateFormat("mm_ss").format(new java.util.Date())

  // write to dbfs
  data.write.mode(SaveMode.Overwrite).csv("/datasets/streamingFilesNormalMixture/" + minuteAndSecond)

  Thread.sleep(5000L) // sleep 5 seconds
}
r: scala.util.Random = scala.util.Random@2334e6f0
a: Int = 0
// Print the first 10 rows without truncating the cell contents.
df_csv.show(numRows = 10, truncate = false)
+-----------------------+-------------------+
|_c0 |_c1 |
+-----------------------+-------------------+
|2019-05-31 15:20:19.483|-0.5190278662580565|
|2019-05-31 15:20:19.488|1.2549405940975034 |
|2019-05-31 15:20:19.493|2.4267606721380233 |
|2019-05-31 15:20:19.498|0.21858105660909444|
|2019-05-31 15:20:19.503|1.7701229392924476 |
|2019-05-31 15:20:19.508|0.08326770280505069|
|2019-05-31 15:20:19.514|11.539205812425335 |
|2019-05-31 15:20:19.519|0.612370126029857 |
|2019-05-31 15:20:19.524|1.299073306785623 |
|2019-05-31 15:20:19.529|2.6939073650678083 |
+-----------------------+-------------------+
only showing top 10 rows
SDS-2.x, Scalable Data Engineering Science
Last refresh: Never