#!/usr/bin/ruby

# ================================================================
# Produces a variety of categorized pseudo-random data in support
# of Miller documentation.
# ================================================================
#
# Sample output:
#
# ./datagen/mkdat2 6 | mlr --opprint cat
# color  shape    flag i u                   v                   w                   x
# purple circle   0    1 0.5637512757306459  0.4981743566291943  0.36884254478967105 4.480962906000271
# orange square   1    2 0.5232158098904274  0.33717333156510765 0.44646982455699713 5.807558719657881
# red    circle   1    3 0.5084894411433544  0.7025118761232125  0.672558508897624   5.094127602621387
# blue   square   0    4 0.3145642940519666  0.3040179460903778  0.6388947856924174  7.517194060345607
# yellow triangle 1    5 0.06776212921515201 0.8517576443958519  0.4319941923080997  4.955913436917771
# red    square   0    6 0.24407904404655156 0.4297654986740608  0.6681496181121647  4.702469482713694
# ================================================================

# main draws each color and shape uniformly from these lists, so the number of
# times a value is listed sets its relative frequency.  Each list is expanded
# from a table of [value, repeat-count] pairs.
$colors = [
  ['red',    16],
  ['green',   4],
  ['blue',    5],
  ['orange',  1],
  ['yellow',  5],
  ['purple',  4],
].flat_map { |name, copies| [name] * copies }

$shapes = [
  ['circle',   3],
  ['square',   5],
  ['triangle', 4],
].flat_map { |name, copies| [name] * copies }

# Probability that flag is 1, looked up by color name
$color_flag_ps = {
  'blue'   => 0.6, 'green' => 0.2, 'orange' => 0.5,
  'purple' => 0.1, 'red'   => 0.3, 'yellow' => 0.9
}

# Weight on the previous value in each autocorrelated series; main passes it
# to HistoryTracker.
$eta = 0.99

# ================================================================
# Prints n records of comma-separated key=value fields, one record per line,
# to standard output.  n is the sole command-line argument when exactly one
# argument is given, and 100000 otherwise.
def main
  row_count = ARGV.length == 1 ? Integer(ARGV[0]) : 100000

  # One autocorrelated series per distinct color/shape pairing
  series_keys = $colors.uniq.product($shapes.uniq).map do |color, shape|
    color + '-' + shape
  end
  tracker = HistoryTracker.new(series_keys, $eta)

  1.upto(row_count) do |row|
    # Repeated list entries make some colors and shapes likelier than others
    color = $colors[rand($colors.length)]
    shape = $shapes[rand($shapes.length)]
    flag  = bernoulli($color_flag_ps[color])

    # u: plain unit-interval uniform
    # v: likewise, except that for red circles v is u plus a perturbation
    #    in [-0.1, 0.1), which correlates the pair
    u = rand
    v = rand
    v = u + 0.2 * (rand - 0.5) if color == 'red' && shape == 'circle'

    # w: next value of this color/shape combination's autocorrelated series;
    # following one combination shows small deltas from one record to the next
    w = tracker.emit(color + '-' + shape)

    # x: bell-shaped value from gaussian
    x = gaussian

    puts "color=#{color},shape=#{shape},flag=#{flag},i=#{row},u=#{u},v=#{v},w=#{w},x=#{x}"
  end
end

# ================================================================
# Bell-shaped pseudo-random value between 0 and 10: the average of six
# unit-interval uniform draws, scaled by 10 (mean 5, stddev about 1.2).
def gaussian
  # Plain left-to-right addition of the six draws
  total = Array.new(6) { rand }.inject(:+)
  10 * total / 6
end

# Returns 1 when a fresh unit-interval uniform draw is below p, else 0.
def bernoulli(p)
  rand < p ? 1 : 0
end

# Autocorrelated pseudo-random time series, one per key.
# For reference please see http://johnkerl.org/rcm/eta.pdf.
#
# Each series starts at a unit-interval uniform value.  Every emit replaces
# the key's value with
#   eta * previous + (1 - eta) * (fresh unit-interval uniform draw).
class HistoryTracker
  # keys: names of the series to track; each gets its own random start value,
  #       drawn in the order the keys are listed.
  # eta:  weight on the previous value at each step.
  def initialize(keys, eta)
    @eta = eta
    @etac = 1.0 - eta
    @prevs = {}
    keys.each { |key| @prevs[key] = rand }
  end

  # Advances the series for key by one step and returns its new value.
  # Raises KeyError if key was not passed to the constructor.
  def emit(key)
    # fetch runs before the draw, so an unknown key consumes no random number
    @prevs[key] = @eta * @prevs.fetch(key) + @etac * rand
  end
end

# ================================================================
# Script entry point.  A broken output pipe (e.g. when the reader is `head`
# and has already exited) counts as a normal, successful end of output.
begin
  main
rescue Errno::EPIPE
  exit(0)
end
