011_02_IntroToSimulation(Scala)

Archived YouTube video of this live unedited lab-lecture:

Introduction to Simulation

Core ideas in Monte Carlo simulation

  • modular arithmetic gives pseudo-random streams that are indistinguishable from 'true' Uniformly distributed samples on the integers $\{0,1,2,\ldots,m-1\}$ (see the sketch after this list)
  • by dividing the integer streams from above by $m$ we get samples from $\{0/m, 1/m, \ldots, (m-1)/m\}$ and "pretend" these are samples from the Uniform(0,1) RV
  • we can use the inverse distribution function or von Neumann's rejection sampler to convert samples from the Uniform(0,1) RV to the following:
    • any other random variable
    • vector of random variables that could be dependent
    • or more generally other random structures:
      • random graphs and networks
      • random walks (or sensible perturbations of live traffic data on open street maps for hypothesis tests)
      • models of interacting particle systems in ecology / chemical physics, etc.
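To make the first two bullets concrete, here is a minimal sketch of such a modular-arithmetic stream, a linear congruential generator; the classical Park-Miller multiplier and modulus are assumed purely for illustration:

// a minimal sketch of a linear congruential generator (LCG): x_{n+1} = a * x_n mod m
// the Park-Miller choices a = 16807 and m = 2^31 - 1 are assumed here for illustration
val m = 2147483647L // modulus: the Mersenne prime 2^31 - 1
val a = 16807L      // multiplier
def lcg(seed: Long): Iterator[Long] = // seed should be in 1,...,m-1 (0 maps to 0 forever)
  Iterator.iterate(seed % m)(x => (a * x) % m).drop(1)
def uniforms(seed: Long): Iterator[Double] = lcg(seed).map(_.toDouble / m) // "pretend" Uniform(0,1)
uniforms(42L).take(5).toList // the same seed always reproduces the same stream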

breeze.stats.distributions

Breeze also provides a fairly large number of probability distributions. These come with access to the probability mass or density function (for discrete or continuous distributions, respectively). Many distributions also have methods for computing the mean and the variance.

import breeze.stats.distributions._

val poi = new Poisson(3.0);
poi: breeze.stats.distributions.Poisson = Poisson(3.0)
val s = poi.sample(5); // let's draw five samples - black-box
s: IndexedSeq[Int] = Vector(4, 3, 1, 6, 1)

Getting probabilities of the Poisson samples

s.map( x => poi.probabilityOf(x) ) // PMF
res0: IndexedSeq[Double] = Vector(0.16803135574154085, 0.22404180765538775, 0.14936120510359185, 0.05040940672246224, 0.14936120510359185)
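As a sanity check, these agree with the closed-form Poisson PMF $P(X=k) = e^{-\lambda}\lambda^k / k!$; the helper below is a hypothetical hand-rolled version, not part of Breeze.

// closed-form Poisson PMF: P(X = k) = exp(-lambda) * lambda^k / k!
def poissonPmf(lambda: Double, k: Int): Double = {
  val kFactorial = (1 to k).foldLeft(1.0)(_ * _) // k! as a Double
  math.exp(-lambda) * math.pow(lambda, k) / kFactorial
}
poissonPmf(3.0, 4) // 0.16803135574154085, matching poi.probabilityOf(4) above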
val doublePoi = for(x <- poi) yield x.toDouble // meanAndVariance requires doubles, but Poisson samples over Ints
doublePoi: breeze.stats.distributions.Rand[Double] = MappedRand(Poisson(3.0),<function1>)
breeze.stats.meanAndVariance(doublePoi.samples.take(1000));
res3: breeze.stats.MeanAndVariance = MeanAndVariance(3.0330000000000017,2.8287397397397376,1000)
(poi.mean, poi.variance) // population mean and variance
res4: (Double, Double) = (3.0,3.0)

Exponential Random Variable

Let's focus on getting our hands dirty with a common random variable.


NOTE: below there is a possibility of confusion about the term rate in the family of exponential distributions: Breeze parameterizes the distribution with the mean but refers to it as the rate.

val expo = new Exponential(0.5);
expo: breeze.stats.distributions.Exponential = Exponential(0.5)
expo.rate // what is the rate parameter
res5: Double = 0.5

A characteristic of an exponential distribution is its half-life, but we can compute the probability that a value falls between any two numbers.

expo.probability(0, math.log(2) * expo.rate)
res6: Double = 0.5
expo.probability(math.log(2) * expo.rate, 10000.0)
res7: Double = 0.5
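Why $\ln(2)$ times the mean is the half-life: writing $\theta$ for the mean (here $\theta = 0.5$, the value stored in Breeze's rate field per the note above), the median $t$ of the Exponential RV solves

$$1 - e^{-t/\theta} = \tfrac{1}{2} \quad\Longleftrightarrow\quad t = \theta \ln 2,$$

which is exactly the endpoint math.log(2) * expo.rate used above.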
expo.probability(0.0, 1.5)
res10: Double = 0.950212931632136

The above result means that approximately 95% of the draws from an exponential distribution fall between 0 and thrice the mean. We could have easily computed this with the cumulative distribution function (CDF) as well.

1 - math.exp(-3.0) // the CDF of the Exponential RV evaluated at thrice the mean
res11: Double = 0.950212931632136

Drawing samples from Exponential RV

val samples = expo.sample(2).sorted; // built-in black box - we will roll our own shortly in Spark
samples: IndexedSeq[Double] = Vector(0.4257656412890666, 1.204167357779466)
expo.probability(samples(0), samples(1));
res12: Double = 0.33679595257034345
breeze.stats.meanAndVariance(expo.samples.take(10000)); // mean and variance of the sample
res13: breeze.stats.MeanAndVariance = MeanAndVariance(2.0277569773572064,4.208974391284182,10000)
(1 / expo.rate, 1 / (expo.rate * expo.rate)) // mean and variance of the population
res14: (Double, Double) = (2.0,4.0)
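Before rolling our own in Spark, here is a minimal plain-Scala sketch of the same idea via the inverse CDF (inversion sampling); scala.util.Random and the helper name expoInvCdfSample are illustrative assumptions, not Breeze API, and we follow the convention above that draws have mean 1/rate.

import scala.util.Random

// inversion sampler for the Exponential RV:
// if U ~ Uniform(0,1) then -log(1 - U) / rate is exponentially distributed with mean 1/rate
def expoInvCdfSample(rate: Double, rng: Random): Double =
  -math.log(1.0 - rng.nextDouble()) / rate // nextDouble is in [0,1), so 1 - u > 0

val rng = new Random(1234567L)
val draws = Vector.fill(10000)(expoInvCdfSample(0.5, rng))
draws.sum / draws.size // sample mean; should be close to 1/0.5 = 2.0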

Pseudo Random Numbers in Spark

import spark.implicits._
import org.apache.spark.sql.functions._
val df = spark.range(1000).toDF("Id") // just make a DF of 1000 row indices
df: org.apache.spark.sql.DataFrame = [Id: bigint]
df.show(5)
+---+
| Id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+
only showing top 5 rows
val dfRand = df.select($"Id", rand(seed=1234567) as "rand") // add a column of random numbers in (0,1)
dfRand: org.apache.spark.sql.DataFrame = [Id: bigint, rand: double]
dfRand.show(5) // these are the first 5 of the 1000 samples from the Uniform(0,1) RV
+---+------------------+
| Id|              rand|
+---+------------------+
|  0|0.2289181799234461|
|  1|0.9756456161051732|
|  2|0.7781702473664945|
|  3|0.5585984240683788|
|  4|0.8305446150005453|
+---+------------------+
only showing top 5 rows
Re-evaluating with the same seed reproduces exactly the same pseudo-random samples.

val dfRand = df.select($"Id", rand(seed=1234567) as "rand") // same seed as above
dfRand.show(5) // the same first 5 samples from the Uniform(0,1) RV
+---+------------------+
| Id|              rand|
+---+------------------+
|  0|0.2289181799234461|
|  1|0.9756456161051732|
|  2|0.7781702473664945|
|  3|0.5585984240683788|
|  4|0.8305446150005453|
+---+------------------+
only showing top 5 rows
dfRand: org.apache.spark.sql.DataFrame = [Id: bigint, rand: double]
val dfRand = df.select($"Id", rand(seed=879664) as "rand") // a different seed gives a different stream
dfRand.show(5) // the first 5 of the 1000 samples from this new stream
+---+-------------------+
| Id|               rand|
+---+-------------------+
|  0|0.08760742852149905|
|  1| 0.4005938791470717|
|  2| 0.8708129699210699|
|  3|0.39454412971200903|
|  4| 0.5635001057818174|
+---+-------------------+
only showing top 5 rows
dfRand: org.apache.spark.sql.DataFrame = [Id: bigint, rand: double]

Let's use the inverse CDF of the Exponential RV to transform these samples from the Uniform(0,1) RV into those from the Exponential RV.
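For the Exponential RV with rate $\lambda$ the CDF is $F(x) = 1 - e^{-\lambda x}$, so its inverse is

$$F^{-1}(u) = -\frac{1}{\lambda}\ln(1-u), \qquad u \in (0,1),$$

and applying $F^{-1}$ to Uniform(0,1) samples yields Exponential($\lambda$) samples; this is exactly the expo_sample expression computed below.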

val dfRand = df.select($"Id", rand(seed=1234567) as "rand") // add a column of random numbers in (0,1)
               .withColumn("one",lit(1.0))
               .withColumn("rate",lit(0.5))
dfRand: org.apache.spark.sql.DataFrame = [Id: bigint, rand: double ... 2 more fields]
dfRand.show(5) 
+---+------------------+---+----+
| Id|              rand|one|rate|
+---+------------------+---+----+
|  0|0.2289181799234461|1.0| 0.5|
|  1|0.9756456161051732|1.0| 0.5|
|  2|0.7781702473664945|1.0| 0.5|
|  3|0.5585984240683788|1.0| 0.5|
|  4|0.8305446150005453|1.0| 0.5|
+---+------------------+---+----+
only showing top 5 rows
val dfExpRand = dfRand.withColumn("expo_sample", -($"one" / $"rate") * log($"one" - $"rand")) // samples from expo(rate=0.5)
dfExpRand: org.apache.spark.sql.DataFrame = [Id: bigint, rand: double ... 3 more fields]
dfExpRand.show(5)
+---+------------------+---+----+------------------+
| Id|              rand|one|rate|       expo_sample|
+---+------------------+---+----+------------------+
|  0|0.2289181799234461|1.0| 0.5| 0.519921578060948|
|  1|0.9756456161051732|1.0| 0.5| 7.430086817819349|
|  2|0.7781702473664945|1.0| 0.5|3.0116901426840474|
|  3|0.5585984240683788|1.0| 0.5|1.6356004297263063|
|  4|0.8305446150005453|1.0| 0.5|3.5503312043026414|
+---+------------------+---+----+------------------+
only showing top 5 rows
display(dfExpRand)
[Density histogram of the expo_sample column]
dfExpRand.describe().show() // looks sensible
+-------+-----------------+--------------------+----+----+--------------------+
|summary|               Id|                rand| one|rate|         expo_sample|
+-------+-----------------+--------------------+----+----+--------------------+
|  count|             1000|                1000|1000|1000|                1000|
|   mean|            499.5| 0.49368134205225334| 1.0| 0.5|    1.98855225198386|
| stddev|288.8194360957494|  0.2925326105055967| 0.0| 0.0|   2.077189631386989|
|    min|                0|6.881987320686012E-4| 1.0| 0.5|0.001376871299039548|
|    max|              999|  0.9999092082356841| 1.0| 0.5|  18.613883955674265|
+-------+-----------------+--------------------+----+----+--------------------+
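These do look sensible: for an Exponential RV with mean $\mu$ the standard deviation also equals $\mu$, and here $\mu = 1/\text{rate} = 2.0$ as computed earlier, so we expect

$$\text{mean} = \text{sd} = 2.0,$$

close to the sample mean 1.9886 and sample stddev 2.0772 above.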
val expoSamplesDF = spark.range(1000000000).toDF("Id") // just make a DF of a billion row indices
               .select($"Id", rand(seed=1234567) as "rand") // add a column of random numbers in (0,1)
               .withColumn("one",lit(1.0))
               .withColumn("rate",lit(0.5))
               .withColumn("expo_sample", -($"one" / $"rate") * log($"one" - $"rand"))
expoSamplesDF: org.apache.spark.sql.DataFrame = [Id: bigint, rand: double ... 3 more fields]
expoSamplesDF.describe().show()
+-------+-------------------+--------------------+----------+----------+--------------------+
|summary|                 Id|                rand|       one|      rate|         expo_sample|
+-------+-------------------+--------------------+----------+----------+--------------------+
|  count|         1000000000|          1000000000|1000000000|1000000000|          1000000000|
|   mean| 4.99999999147278E8|  0.4999955993951372|       1.0|       0.5|   1.999981045300269|
| stddev|2.886751347391513E8| 0.28868028535174534|       0.0|       0.0|   1.999982775685239|
|    min|                  0|2.564836121266012...|       1.0|       0.5|5.129672243189862...|
|    max|          999999999|  0.9999999985370288|       1.0|       0.5|   40.68559281689348|
+-------+-------------------+--------------------+----------+----------+--------------------+