require(tidyverse)
# Load the GeoPandas and GeoSpark benchmark logs (pipe-delimited, no header
# row) and average the recorded timings per Method and Size.
benchmark_columns <- c("Timestamp", "Method", "Size", "TotalTime", "AreaTablesTime")

# stringsAsFactors = FALSE keeps text columns as character on R < 4.0, so the
# as.numeric() calls below convert the recorded values, never factor codes.
geopandas <- read.table(
  "~/RIDIR/Code/Validation/GeoPandasBenchmark.tsv",
  header = FALSE, sep = "|", stringsAsFactors = FALSE
)
names(geopandas) <- benchmark_columns
head(geopandas)

geospark <- read.table(
  "~/RIDIR/Code/Validation/GeoSparkBenchmark.tsv",
  header = FALSE, sep = "|", stringsAsFactors = FALSE
)
names(geospark) <- benchmark_columns
head(geospark)

# Stack both logs, average the timings for every Method/Size pair, and drop
# the leftover grouping so later steps work on a plain tibble.
benchmark <- as_tibble(rbind(geopandas, geospark)) %>%
  select(Method, Size, TotalTime, AreaTablesTime) %>%
  mutate(
    TotalTime = as.numeric(TotalTime),
    AreaTablesTime = as.numeric(AreaTablesTime)
  ) %>%
  group_by(Method, Size) %>%
  summarise(
    TotalTime = mean(TotalTime),
    AreaTablesTime = mean(AreaTablesTime)
  ) %>%
  ungroup() %>%
  # Explicit levels keep the sizes in ascending order ("100%" would otherwise
  # sort before "20%" alphabetically).
  mutate(
    Size = factor(
      Size,
      levels = c("10%", "20%", "30%", "40%", "50%", "60%", "70%", "80%", "90%", "100%")
    )
  )
head(benchmark)
The CA (California) dataset has 5846 polygons in the source and 7953 polygons in the target. Each 10% increase in the size of the dataset corresponds to about 584 additional polygons in the source and 787 additional polygons in the target.
# Dodged bar chart of AreaTablesTime for each CA dataset size, with one bar
# per Method at every size.
g <- ggplot(benchmark, aes(x = Size, y = AreaTablesTime, fill = Method)) +
  geom_col(position = position_dodge(width = 0.75), width = 0.7) +
  labs(
    title = "Benchmark area_tables using CA dataset",
    x = "Size of CA dataset",
    y = "Time(s)"
  )
plot(g)
# Load the GeoSpark runs on the Full dataset (pipe-delimited, no header row)
# and average the run time for each dataset size.
# The path is written relative to the home directory, matching the other
# benchmark files, instead of a user-specific absolute path.
data <- read_delim(
  "~/RIDIR/Code/Validation/GeoSparkFull.tsv",
  delim = "|", col_names = FALSE
) %>%
  rename(
    Timestamp = X1, Method = X2, Cores = X3, Nodes = X4, Partitions = X5,
    Time = X6, Results = X7, Size = X8, AppID = X9
  ) %>%
  select(Size, Time) %>%
  # Size is multiplied by 100 and suffixed with "%" to build the axis label.
  mutate(Size = paste0(Size * 100, "%"), Time = as.numeric(Time)) %>%
  group_by(Size) %>%
  summarise(Time = mean(Time)) %>%
  # Explicit levels keep the sizes in ascending order on the plot axis.
  mutate(Size = factor(Size, levels = c("20%", "40%", "60%", "80%", "100%")))
head(data)
The Full dataset has 72693 polygons in the source and 61332 polygons in the target. For reference, the GeoPandas implementation run on the full dataset throws a memory error after ~45 min of execution.
# Bar chart of the averaged GeoSpark Time for each size of the Full dataset.
# NOTE(review): ylim() fixes the y scale to 0-100, and ggplot2 drops values
# outside scale limits -- confirm no averaged Time exceeds 100.
f <- ggplot(data, aes(x = Size, y = Time)) +
  geom_col(position = position_dodge(width = 0.75), width = 0.7) +
  ylim(0, 100) +
  labs(
    title = "Benchmark GeoSpark area_tables using the Full dataset",
    x = "Size of Full dataset",
    y = "Time(s)"
  )
plot(f)