library(mvtnorm)
library(ggplot2)
library(dplyr)
##
## Dołączanie pakietu: 'dplyr'
## Następujące obiekty zostały zakryte z 'package:stats':
##
## filter, lag
## Następujące obiekty zostały zakryte z 'package:base':
##
## intersect, setdiff, setequal, union
# Different randomization functions with different bias
# Totally random
random_move = function(x, y){
yoff = runif(1, -0.1, 0.1)
xoff = runif(1, -0.1, 0.1)
nx = x + xoff
ny = y + yoff
return(c(nx, ny))
}
# Four centers
random_move_four = function(x, y){
yoff = runif(1, -0.1, 0.1)
if (x > 35){
xoff = runif(1, -0.05, 0.1)
} else {
xoff = runif(1, -0.1, 0.05)
}
if (y > -42.5){
yoff = runif(1, -0.05, 0.1)
} else {
yoff = runif(1, -0.1, 0.05)
}
nx = x + xoff
ny = y + yoff
return(c(nx, ny))
}
# Several strips
random_move_strips = function(x, y){
if (x > 10* (round(x/10))){
xoff = runif(1, -0.05, 0.1)
} else {
xoff = runif(1, -0.1, 0.05)
}
if (y > -42.5){
yoff = runif(1, -0.05, 0.1)
} else {
yoff = runif(1, -0.1, 0.05)
}
nx = x + xoff
ny = y + yoff
return(c(nx, ny))
}
# Two strips
random_move_two_strips = function(x, y){
yoff = runif(1, -0.1, 0.1)
if (x > 35){
if (x < 55){
xoff = runif(1, -0.05, 0.1)
} else{
xoff = runif(1, -0.1, 0.1)
}
} else {
if (x > 15){
xoff = runif(1, -0.1, 0.05)
} else {
xoff = runif(1, -0.1, 0.1)
}
}
nx = x + xoff
ny = y + yoff
return(c(nx, ny))
}
# Check whether parameters of new multivariate distribution are the same.
check = function(epsilon, new_s, cr, cv, xm, ym, xsd, ysd){
ncr = cor(new_s[,1], new_s[,2])
res = T
if (!((ncr > (cr - epsilon)) & (ncr < (cr + epsilon)))){
return(F)
}
ncv = cov(new_s[,1], new_s[,2])
if (!((ncv > (cv - epsilon)) & (ncv < (cv + epsilon)))){
return(F)
}
nxm = mean(new_s[,1])
if (!((nxm > (xm - epsilon)) & (nxm < (xm + epsilon)))){
return(F)
}
nym = mean(new_s[,2])
if (!((nym > (ym - epsilon)) & (nym < (ym + epsilon)))){
return(F)
}
nxsd = sd(new_s[,1])
if (!((nxsd > (xsd - epsilon)) & (nxsd < (xsd + epsilon)))){
return(F)
}
nysd = sd(new_s[,2])
if (!((nysd > (ysd - epsilon)) & (nysd < (ysd + epsilon)))){
return(F)
}
return(T)
}
# Move random point using move function k times and check parameteres of resulting dist
move_and_check = function(move, k, epsilon, sym, cr, cv, xm, ym, xsd, ysd){
n_rows = nrow(sym)
fail = 0
for (i in 1:k){
r_row = round(runif(1, 1, n_rows))
new_xy = move(sym[r_row, 1], sym[r_row, 2])
new_sym = sym
new_sym[r_row, 1] = new_xy[1]
new_sym[r_row, 2] = new_xy[2]
if (check(epsilon, new_sym, cr, cv, xm, ym, xsd, ysd)){
sym = new_sym
} else {
fail = fail + 1
}
}
print(fail)
return(sym)
}
beav = read.csv("beaver_dataset.csv")
beav$x = beav$x/10
beav$y = beav$y/10
cr = cor(beav[,"x"], beav[, "y"])
cv = cov(beav[,"x"], beav[, "y"])
xm = mean(beav[,"x"])
ym = mean(beav[,"y"])
xsd = sd(beav[,"x"])
ysd = sd(beav[,"y"])
# Generate three differently looking distributions with the same parameters as sym
tot_random = move_and_check(random_move,
1000000, 0.1, as.matrix(beav[,c("x","y")]), cr, cv, xm, ym, xsd, ysd)
## [1] 30473
two_strips = move_and_check(random_move_two_strips,
1000000, 0.1, tot_random, cr, cv, xm, ym, xsd, ysd)
## [1] 395556
many_strips = move_and_check(random_move_strips,
1000000, 0.1, tot_random, cr, cv, xm, ym, xsd, ysd)
## [1] 604964
four_centers = move_and_check(random_move_four,
1000000, 0.1, tot_random, cr, cv, xm, ym, xsd, ysd)
## [1] 646505
par(mfrow = c(2,2))
random = data.frame(tot_random)
colnames(random) = c("x", "y")
random$dataset = "totally random"
two_strips = data.frame(two_strips)
colnames(two_strips) = c("x", "y")
two_strips$dataset = "two strips"
many_strips = data.frame(many_strips)
colnames(many_strips) = c("x", "y")
many_strips$dataset = "many strips"
four_centers = data.frame(four_centers)
colnames(four_centers) = c("x", "y")
four_centers$dataset = "four centers"
dataset = rbind(beav[,c("x", "y", "dataset")], random, two_strips, many_strips, four_centers)
ggplot(data = dataset) + geom_point(aes(x = x, y = y)) + facet_wrap(~dataset, ncol=2)

library(dplyr)
group_by(dataset, dataset) %>% summarise("Korelacja" = cor(x, y),
"Kowariancja" = cov(x, y),
"Średnia X" = mean(x),
"Średnia Y" = mean(y),
"Odchylenie standardowe X" = sd(x),
"Odchylenie standardowe Y" = sd(y),
)
## # A tibble: 5 x 7
## dataset Korelacja Kowariancja `Średnia X` `Średnia Y` `Odchylenie standar…
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 beaver 0.495 94.9 36.8 -85.9 18.6
## 2 four cente… 0.488 94.9 36.9 -86.0 18.7
## 3 many strips 0.488 94.9 36.9 -86.0 18.7
## 4 totally ra… 0.489 94.9 36.9 -85.9 18.7
## 5 two strips 0.490 94.9 36.9 -85.8 18.7
## # … with 1 more variable: Odchylenie standardowe Y <dbl>
write.csv(dataset, "beaver_full.csv")