Data beaver

library(mvtnorm)
library(ggplot2)
library(dplyr)

## 
## Dołączanie pakietu: 'dplyr'

## Następujące obiekty zostały zakryte z 'package:stats':
## 
##     filter, lag

## Następujące obiekty zostały zakryte z 'package:base':
## 
##     intersect, setdiff, setequal, union

# Different randomization functions with different bias

# Totally random

random_move = function(x, y){
  yoff = runif(1, -0.1, 0.1)
  xoff = runif(1, -0.1, 0.1)
  nx = x + xoff
  ny = y + yoff
  return(c(nx, ny))
}

# Four centers
random_move_four = function(x, y){
  yoff = runif(1, -0.1, 0.1)
  if (x > 35){
    xoff = runif(1, -0.05, 0.1)
  } else {
    xoff = runif(1, -0.1, 0.05)
  }
  
  if (y > -42.5){
    yoff = runif(1, -0.05, 0.1)
  } else {
    yoff = runif(1, -0.1, 0.05)
  }
  
  nx = x + xoff
  ny = y + yoff
  return(c(nx, ny))
}

# Several strips
random_move_strips = function(x, y){
  if (x > 10* (round(x/10))){
    xoff = runif(1, -0.05, 0.1)
  } else {
    xoff = runif(1, -0.1, 0.05)
  }
  
  if (y > -42.5){
    yoff = runif(1, -0.05, 0.1)
  } else {
    yoff = runif(1, -0.1, 0.05)
  }
  
  nx = x + xoff
  ny = y + yoff
  return(c(nx, ny))
}

# Two strips
random_move_two_strips = function(x, y){
  yoff = runif(1, -0.1, 0.1)
  if (x > 35){
    if (x < 55){
    xoff = runif(1, -0.05, 0.1)
    } else{
    xoff = runif(1, -0.1, 0.1)
    }
  } else {
    if (x > 15){
      xoff = runif(1, -0.1, 0.05)
    } else {
    xoff = runif(1, -0.1, 0.1)
    }
  }
  nx = x + xoff
  ny = y + yoff
  return(c(nx, ny))
}


# Check whether parameters of new multivariate distribution are the same.
check = function(epsilon, new_s, cr, cv, xm, ym, xsd, ysd){
  
  ncr = cor(new_s[,1], new_s[,2])
  res = T
  
  if (!((ncr > (cr - epsilon)) & (ncr < (cr + epsilon)))){
    return(F)
  } 
  
  ncv = cov(new_s[,1], new_s[,2])
  if (!((ncv > (cv - epsilon)) & (ncv < (cv + epsilon)))){
    return(F)
  }
  
  nxm = mean(new_s[,1])
  if (!((nxm > (xm - epsilon)) & (nxm < (xm + epsilon)))){
    return(F)
  }
  
  nym = mean(new_s[,2])
  if (!((nym > (ym - epsilon)) & (nym < (ym + epsilon)))){
    return(F)
  }
  
  nxsd = sd(new_s[,1])
  if (!((nxsd > (xsd - epsilon)) & (nxsd < (xsd + epsilon)))){
    return(F)
  }
    
  nysd = sd(new_s[,2])
  if (!((nysd > (ysd - epsilon)) & (nysd < (ysd + epsilon)))){
    return(F)
  }
  
  return(T)
}

# Move random point using move function k times and check parameteres of resulting dist
move_and_check = function(move, k, epsilon, sym, cr, cv, xm, ym, xsd, ysd){
  n_rows = nrow(sym)
  fail = 0
  for (i in 1:k){
    r_row = round(runif(1, 1, n_rows))
    new_xy = move(sym[r_row, 1], sym[r_row, 2])
    new_sym = sym
    new_sym[r_row, 1] = new_xy[1]
    new_sym[r_row, 2] = new_xy[2]
    if (check(epsilon, new_sym, cr, cv, xm, ym, xsd, ysd)){
      sym = new_sym
    } else {
      fail = fail + 1
    }
  }
  print(fail)
  return(sym)
}

beav = read.csv("beaver_dataset.csv")
beav$x = beav$x/10
beav$y = beav$y/10 
cr = cor(beav[,"x"], beav[, "y"])
cv = cov(beav[,"x"], beav[, "y"])
xm = mean(beav[,"x"])
ym = mean(beav[,"y"])
xsd = sd(beav[,"x"])
ysd = sd(beav[,"y"])

# Generate three differently looking distributions with the same parameters as sym

tot_random = move_and_check(random_move,
                            1000000, 0.1, as.matrix(beav[,c("x","y")]), cr, cv, xm, ym, xsd, ysd)

## [1] 30473

two_strips = move_and_check(random_move_two_strips,
                            1000000, 0.1, tot_random, cr, cv, xm, ym, xsd, ysd)

## [1] 395556

many_strips = move_and_check(random_move_strips,
                            1000000, 0.1, tot_random, cr, cv, xm, ym, xsd, ysd)

## [1] 604964

four_centers = move_and_check(random_move_four,
                            1000000, 0.1, tot_random, cr, cv, xm, ym, xsd, ysd)

## [1] 646505

par(mfrow = c(2,2))

random = data.frame(tot_random)
colnames(random) = c("x", "y")
random$dataset = "totally random"

two_strips = data.frame(two_strips)
colnames(two_strips) = c("x", "y")
two_strips$dataset = "two strips"

many_strips = data.frame(many_strips)
colnames(many_strips) = c("x", "y")
many_strips$dataset = "many strips"

four_centers = data.frame(four_centers)
colnames(four_centers) = c("x", "y")
four_centers$dataset = "four centers"

dataset = rbind(beav[,c("x", "y", "dataset")], random, two_strips, many_strips, four_centers)

ggplot(data = dataset) + geom_point(aes(x = x, y = y)) + facet_wrap(~dataset, ncol=2)

library(dplyr)
group_by(dataset, dataset) %>% summarise("Korelacja" = cor(x, y),
                                                              "Kowariancja" = cov(x, y),
                                                              "Średnia X" = mean(x),
                                                              "Średnia Y" = mean(y),
                                                              "Odchylenie standardowe X" = sd(x),
                                                              "Odchylenie standardowe Y" = sd(y),
)

## # A tibble: 5 x 7
##   dataset     Korelacja Kowariancja `Średnia X` `Średnia Y` `Odchylenie standar…
##   <chr>           <dbl>       <dbl>       <dbl>       <dbl>                <dbl>
## 1 beaver          0.495        94.9        36.8       -85.9                 18.6
## 2 four cente…     0.488        94.9        36.9       -86.0                 18.7
## 3 many strips     0.488        94.9        36.9       -86.0                 18.7
## 4 totally ra…     0.489        94.9        36.9       -85.9                 18.7
## 5 two strips      0.490        94.9        36.9       -85.8                 18.7
## # … with 1 more variable: Odchylenie standardowe Y <dbl>

write.csv(dataset, "beaver_full.csv")

Data beaver

Bartosz Maćkiewicz

27 stycznia 2020