--- title: "Data beaver" author: "Bartosz Maćkiewicz" date: "27 stycznia 2020" output: html_document editor_options: chunk_output_type: console --- ```{r} library(mvtnorm) library(ggplot2) library(dplyr) ``` ```{r echo=T} # Different randomization functions with different bias # Totally random random_move = function(x, y){ yoff = runif(1, -0.1, 0.1) xoff = runif(1, -0.1, 0.1) nx = x + xoff ny = y + yoff return(c(nx, ny)) } # Four centers random_move_four = function(x, y){ yoff = runif(1, -0.1, 0.1) if (x > 35){ xoff = runif(1, -0.05, 0.1) } else { xoff = runif(1, -0.1, 0.05) } if (y > -42.5){ yoff = runif(1, -0.05, 0.1) } else { yoff = runif(1, -0.1, 0.05) } nx = x + xoff ny = y + yoff return(c(nx, ny)) } # Several strips random_move_strips = function(x, y){ if (x > 10* (round(x/10))){ xoff = runif(1, -0.05, 0.1) } else { xoff = runif(1, -0.1, 0.05) } if (y > -42.5){ yoff = runif(1, -0.05, 0.1) } else { yoff = runif(1, -0.1, 0.05) } nx = x + xoff ny = y + yoff return(c(nx, ny)) } # Two strips random_move_two_strips = function(x, y){ yoff = runif(1, -0.1, 0.1) if (x > 35){ if (x < 55){ xoff = runif(1, -0.05, 0.1) } else{ xoff = runif(1, -0.1, 0.1) } } else { if (x > 15){ xoff = runif(1, -0.1, 0.05) } else { xoff = runif(1, -0.1, 0.1) } } nx = x + xoff ny = y + yoff return(c(nx, ny)) } # Check whether parameters of new multivariate distribution are the same. check = function(epsilon, new_s, cr, cv, xm, ym, xsd, ysd){ ncr = cor(new_s[,1], new_s[,2]) res = T if (!((ncr > (cr - epsilon)) & (ncr < (cr + epsilon)))){ return(F) } ncv = cov(new_s[,1], new_s[,2]) if (!((ncv > (cv - epsilon)) & (ncv < (cv + epsilon)))){ return(F) } nxm = mean(new_s[,1]) if (!((nxm > (xm - epsilon)) & (nxm < (xm + epsilon)))){ return(F) } nym = mean(new_s[,2]) if (!((nym > (ym - epsilon)) & (nym < (ym + epsilon)))){ return(F) } nxsd = sd(new_s[,1]) if (!((nxsd > (xsd - epsilon)) & (nxsd < (xsd + epsilon)))){ return(F) } nysd = sd(new_s[,2]) if (!((nysd > (ysd - epsilon)) & (nysd < (ysd + epsilon)))){ return(F) } return(T) } # Move random point using move function k times and check parameteres of resulting dist move_and_check = function(move, k, epsilon, sym, cr, cv, xm, ym, xsd, ysd){ n_rows = nrow(sym) fail = 0 for (i in 1:k){ r_row = round(runif(1, 1, n_rows)) new_xy = move(sym[r_row, 1], sym[r_row, 2]) new_sym = sym new_sym[r_row, 1] = new_xy[1] new_sym[r_row, 2] = new_xy[2] if (check(epsilon, new_sym, cr, cv, xm, ym, xsd, ysd)){ sym = new_sym } else { fail = fail + 1 } } print(fail) return(sym) } ``` ```{r} beav = read.csv("beaver_dataset.csv") beav$x = beav$x/10 beav$y = beav$y/10 cr = cor(beav[,"x"], beav[, "y"]) cv = cov(beav[,"x"], beav[, "y"]) xm = mean(beav[,"x"]) ym = mean(beav[,"y"]) xsd = sd(beav[,"x"]) ysd = sd(beav[,"y"]) ``` ```{r} # Generate three differently looking distributions with the same parameters as sym tot_random = move_and_check(random_move, 1000000, 0.1, as.matrix(beav[,c("x","y")]), cr, cv, xm, ym, xsd, ysd) ``` ```{r} two_strips = move_and_check(random_move_two_strips, 1000000, 0.1, tot_random, cr, cv, xm, ym, xsd, ysd) ``` ```{r} many_strips = move_and_check(random_move_strips, 1000000, 0.1, tot_random, cr, cv, xm, ym, xsd, ysd) ``` ```{r} four_centers = move_and_check(random_move_four, 1000000, 0.1, tot_random, cr, cv, xm, ym, xsd, ysd) ``` ```{r} par(mfrow = c(2,2)) random = data.frame(tot_random) colnames(random) = c("x", "y") random$dataset = "totally random" two_strips = data.frame(two_strips) colnames(two_strips) = c("x", "y") two_strips$dataset = "two strips" many_strips = data.frame(many_strips) colnames(many_strips) = c("x", "y") many_strips$dataset = "many strips" four_centers = data.frame(four_centers) colnames(four_centers) = c("x", "y") four_centers$dataset = "four centers" dataset = rbind(beav[,c("x", "y", "dataset")], random, two_strips, many_strips, four_centers) ``` ```{r fig.width=15, fig.height=15} ggplot(data = dataset) + geom_point(aes(x = x, y = y)) + facet_wrap(~dataset, ncol=2) ``` ```{r} library(dplyr) group_by(dataset, dataset) %>% summarise("Korelacja" = cor(x, y), "Kowariancja" = cov(x, y), "Średnia X" = mean(x), "Średnia Y" = mean(y), "Odchylenie standardowe X" = sd(x), "Odchylenie standardowe Y" = sd(y), ) write.csv(dataset, "beaver_full.csv") ```