๐ฎ Day 4 - Spell Solutions#
# Load our magical libraries
library(ggplot2)
library(dplyr)
library(gridExtra)
library(infer) # For rep_sample_n function
set.seed(123) # For reproducible results
Attaching package: โdplyrโ
The following objects are masked from โpackage:statsโ:
filter, lag
The following objects are masked from โpackage:baseโ:
intersect, setdiff, setequal, union
Attaching package: โgridExtraโ
The following object is masked from โpackage:dplyrโ:
combine
๐ฎ Day 4 - Spell 1: Distribution Detective Mission#
๐ฏ Mission Briefing#
Discover the magical secrets of mean, median, and mode through different creature power distributions!
๐ก What are Distributions?#
Think of distributions like different ways data can be shaped - some are balanced (symmetrical), some lean to one side (skewed), and some have unusual outliers!
# Load our magical creature dataset
creatures <- read.csv("../datasets/magical_creatures_stats.csv")
# Let's peek at our data first
head(creatures)
# Separate each creature type for easier analysis
dragons <- creatures[creatures$creature_type == "dragon", ]
unicorns <- creatures[creatures$creature_type == "unicorn", ]
phoenixes <- creatures[creatures$creature_type == "phoenix", ]
cat("We have", nrow(dragons), "dragons,", nrow(unicorns), "unicorns, and", nrow(phoenixes), "phoenixes!")
| creature_id | creature_type | power_level | |
|---|---|---|---|
| <int> | <chr> | <int> | |
| 1 | 1 | dragon | 71 |
| 2 | 2 | dragon | 42 |
| 3 | 3 | dragon | 55 |
| 4 | 4 | dragon | 59 |
| 5 | 5 | dragon | 56 |
| 6 | 6 | dragon | 48 |
We have 1000 dragons, 1000 unicorns, and 1000 phoenixes!
๐ Investigation 1: Dragon Power (Normal Distribution)#
Dragons have very balanced power - most dragons are medium strength, with fewer very weak or very strong dragons.
# Create a histogram of dragon power
ggplot(dragons, aes(x = power_level)) +
geom_histogram(fill ="#a4ce89", color = "white", alpha = 0.7, bins = 25) +
labs(title = "Dragon Power Distribution (Normal/Symmetrical)",
x = "Dragon Power Level",
y = "Number of Dragons") +
theme_minimal()
# Calculate statistics for dragon power
dragon_stats <- dragons %>%
summarise(
mean_power = mean(power_level),
median_power = median(power_level),
min_power = min(power_level),
max_power = max(power_level)
)
print("Dragon Power Statistics:")
print(dragon_stats)
๐ก Key Learning#
In symmetrical distributions, mean and median are very close!
๐ฆ Investigation 2: Unicorn Power (Left-Skewed Distribution)#
Most unicorns have high power, but a few have very low power - this creates a โtailโ on the left.
# Create histogram of unicorn magic
ggplot(unicorns, aes(x = power_level)) +
geom_histogram(fill = "pink", color = "white", alpha = 0.7, bins = 25) +
labs(title = "Unicorn Power Distribution (Left-Skewed)",
x = "Unicorn Power Level",
y = "Number of Unicorns") +
theme_minimal()
# Calculate statistics for unicorn power
unicorn_stats <- unicorns %>%
summarise(
mean_power = mean(power_level),
median_power = median(power_level),
min_power = min(power_level),
max_power = max(power_level)
)
print("Unicorn Power Statistics:")
print(unicorn_stats)
๐ก Key Learning#
In left-skewed distributions, the mean is PULLED DOWN by the low values!
๐ฅ Investigation 3: Phoenix Power (Right-Skewed Distribution)#
Most phoenixes have low power, but a few have extremely high power - this creates a โtailโ on the right.
# Create histogram of phoenix energy
ggplot(phoenixes, aes(x = power_level)) +
geom_histogram(fill = "#f1c979", color = "white", alpha = 0.7, bins = 25) +
labs(title = "Phoenix Power Distribution (Right-Skewed)",
x = "Phoenix Power Level",
y = "Number of Phoenixes") +
theme_minimal()
# Calculate statistics for phoenix power
phoenix_stats <- phoenixes %>%
summarise(
mean_power = mean(power_level),
median_power = median(power_level),
min_power = min(power_level),
max_power = max(power_level)
)
print("๐ฅ Phoenix Power Statistics:")
print(phoenix_stats)
๐ก Key Learning#
In right-skewed distributions, the mean is PULLED UP by the high values!
๐จ Stacked Distribution Gallery#
Letโs see all three distributions stacked vertically in one beautiful view!
# Create individual plots
p1 <- ggplot(dragons, aes(x = power_level)) +
geom_histogram(fill ="#a4ce89", color = "white", alpha = 0.7, bins = 25) +
labs(title = "Dragon Power (Normal Distribution)",
x = "Power Level",
y = "Count") +
theme_minimal() +
theme(plot.title = element_text(size = 12, hjust = 0.5))
p2 <- ggplot(unicorns, aes(x = power_level)) +
geom_histogram(fill = "pink", color = "white", alpha = 0.7, bins = 25) +
labs(title = "Unicorn Power (Left-Skewed Distribution)",
x = "Power Level",
y = "Count") +
theme_minimal() +
theme(plot.title = element_text(size = 12, hjust = 0.5))
p3 <- ggplot(phoenixes, aes(x = power_level)) +
geom_histogram(fill = "#f1c979", color = "white", alpha = 0.7, bins = 25) +
labs(title = "Phoenix Power (Right-Skewed Distribution)",
x = "Power Level",
y = "Count") +
theme_minimal() +
theme(plot.title = element_text(size = 12, hjust = 0.5))
# Stack them in a 3x1 grid (3 rows, 1 column)
grid.arrange(p1, p2, p3, nrow = 3, ncol = 1)
๐ก What Do You Notice?#
Look at how different each shape is! Can you see the โtailsโ and where most of the data clusters?
๐ The Great Comparison Challenge#
Letโs compare all three distributions with numbers!
# Combine all the statistics
all_stats <- data.frame(
Creature = c("Dragon (Normal)", "Unicorn (Left-Skewed)", "Phoenix (Right-Skewed)"),
Mean = c(dragon_stats$mean_power, unicorn_stats$mean_power, phoenix_stats$mean_power),
Median = c(dragon_stats$median_power, unicorn_stats$median_power, phoenix_stats$median_power),
Difference = c(
abs(dragon_stats$mean_power - dragon_stats$median_power),
abs(unicorn_stats$mean_power - unicorn_stats$median_power),
abs(phoenix_stats$mean_power - phoenix_stats$median_power)
)
)
print("๐ The Great Statistics Comparison:")
print(all_stats)
[1] "๐ The Great Statistics Comparison:"
Creature Mean Median Difference
1 Dragon (Normal) 49.656 50 0.344
2 Unicorn (Left-Skewed) 80.331 82 1.669
3 Phoenix (Right-Skewed) 19.493 17 2.493
๐ก Mystery Question#
Which creature type has the biggest difference between mean and median? Why?
๐ก Take-Home Messages from Spell 1#
Symmetrical distributions: Mean โ Median (theyโre close friends!)
Left-skewed distributions: Mean < Median (mean gets pulled down)
Right-skewed distributions: Mean > Median (mean gets pulled up)
When data is skewed, median is often a better measure of โtypicalโ value
๐ Your Turn!#
Try running each code chunk and observe how the different distribution shapes affect the relationship between mean, median, and mode. Can you explain why the mean โmovesโ toward the tail in skewed distributions?
๐ฎ Day 4 - Spell 2: The Great Creature Sampling Adventure#
๐ก What is Sampling?#
Imagine you want to know how many dragons live in the entire Enchanted Forest, but you canโt count them all! So you explore small areas, count the dragons there, and use that to guess about the whole forest. Thatโs sampling!
๐ฒ Population = All the creatures in the entire Enchanted Forest (everything we want to know about)
๐ Sample = The creatures we find in one small area we explore (the small group we actually study)
๐ Statistic = Something we calculate using our sample (like โ30% of creatures in our sample are dragonsโ)
๐ฏ Inference = Using our sample to make a conclusion about the whole population
๐ Our Magical Population: The Enchanted Forest#
Letโs create our population of 100000 magical creatures where exactly 30% are dragons!
# Create our population of 100000 creatures
# 30000 dragons (30%) and 70000 other creatures (70%)
magical_creatures <- tibble(
creature_id = 1:100000,
creature_type = factor(c(rep("dragon", 30000), rep("other", 70000)))
)
# Let's verify our true population proportion
true_dragon_proportion <- sum(magical_creatures$creature_type == "dragon") / nrow(magical_creatures)
print(paste("๐ฏ True proportion of dragons in population:", true_dragon_proportion))
# Visualize our population
population_summary <- magical_creatures %>%
count(creature_type) %>%
mutate(proportion = n / sum(n))
ggplot(population_summary, aes(x = creature_type, y = proportion, fill = creature_type)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("dragon" = "orange", "other" = "gray")) +
labs(title = "True Population: Enchanted Forest Creatures",
x = "Creature Type",
y = "Proportion",
subtitle = "Population Size: 100000 creatures, 30% dragons") +
theme_minimal() +
theme(text = element_text(size = 16),
plot.title = element_text(size = 20),
axis.title = element_text(size = 18),
axis.text = element_text(size = 19),
legend.position = "none")
๐ฃ Taking Our First Sample (Size 40)#
Letโs explore one area of the forest and see what creatures we find!
# Take one random sample of 40 creatures
creature_sample_1 <- rep_sample_n(magical_creatures, size = 40)
head(creature_sample_1)
# What's the proportion of dragons in our sample?
dragon_estimate_1 <- creature_sample_1 %>%
count(creature_type) %>%
mutate(proportion = n / sum(n))
print("๐ Our first sample results:")
head(dragon_estimate_1)
ggplot(dragon_estimate_1, aes(x = creature_type, y = proportion, fill = creature_type)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("dragon" = "orange", "other" = "gray")) +
labs(title = "First Sample: Enchanted Forest Creatures",
x = "Creature Type",
y = "Proportion",
subtitle = "Sample Size: 40 creatures") +
theme_minimal() +
theme(text = element_text(size = 16),
plot.title = element_text(size = 20),
axis.title = element_text(size = 18),
axis.text = element_text(size = 19),
legend.position = "none")
๐ฃ Taking Another Sample#
What happens if we explore a different area of the forest? Will we get the same result?
# Take another random sample of 40 creatures
dragon_estimate_2 <- rep_sample_n(magical_creatures, size = 40) %>%
count(creature_type) %>%
mutate(proportion = n / sum(n))
print("๐ Our second sample results:")
head(dragon_estimate_2)
ggplot(dragon_estimate_2, aes(x = creature_type, y = proportion, fill = creature_type)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("dragon" = "orange", "other" = "gray")) +
labs(title = "Second Sample: Enchanted Forest Creatures",
x = "Creature Type",
y = "Proportion",
subtitle = "Sample Size: 40 creatures") +
theme_minimal() +
theme(text = element_text(size = 16),
plot.title = element_text(size = 20),
axis.title = element_text(size = 18),
axis.text = element_text(size = 19),
legend.position = "none")
๐ฃ And One More Sample!#
Letโs try once more to see the variability between samples:
# Take a third random sample of 40 creatures
dragon_estimate_3 <- rep_sample_n(magical_creatures, size = 40) %>%
count(creature_type) %>%
mutate(proportion = n / sum(n))
print("๐ Our third sample results:")
head(dragon_estimate_3)
ggplot(dragon_estimate_3, aes(x = creature_type, y = proportion, fill = creature_type)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("dragon" = "orange", "other" = "gray")) +
labs(title = "Third Sample: Enchanted Forest Creatures",
x = "Creature Type",
y = "Proportion",
subtitle = "Sample Size: 40 creatures") +
theme_minimal() +
theme(text = element_text(size = 16),
plot.title = element_text(size = 20),
axis.title = element_text(size = 18),
axis.text = element_text(size = 19),
legend.position = "none")
๐ The Sampling Distribution: Taking LOTS of Samples!#
What if we explore 1000 different areas of the forest? Letโs see what the distribution of our estimates looks like!
# Take 1000 samples of size 40 and calculate proportions for each
sampling_results <- rep_sample_n(magical_creatures, size = 40, reps = 300) %>%
group_by(replicate) %>%
summarize(proportion = sum(creature_type == "dragon") / 40)
mean_of_sampling <- mean(sampling_results$proportion)
sd_of_sampling <- sd(sampling_results$proportion)
# Plot the sampling distribution
ggplot(sampling_results, aes(x = proportion)) +
geom_histogram(binwidth = 0.05, fill = "#91d4eb", color = "white", alpha = 0.7) +
geom_vline(xintercept = true_dragon_proportion, color = "#9090ea", linetype = "solid", linewidth = 1) +
geom_vline(xintercept = mean_of_sampling, color = "#e1b086", linetype = "dashed", linewidth = 1) +
labs(title = "Sampling Distribution: 1000 Samples of Size 40",
x = "Sample Proportion of Dragons",
y = "Count",
subtitle = "Purple solid line shows true population proportion (0.30), Orange dashed line shows mean of sampling distribution") +
theme_minimal() +
theme(text = element_text(size = 12),
plot.title = element_text(size = 16),
axis.title = element_text(size = 15),
axis.text = element_text(size = 12),
legend.position = "none")
# Calculate statistics about our sampling distribution
print(paste("๐ Average of all sample proportions:", round(mean_of_sampling, 3)))
print(paste("๐ Spread (standard deviation) of sample proportions:", round(sd_of_sampling, 3)))
print(paste("๐ฏ True population proportion:", true_dragon_proportion))
๐ The Magic of Sample Size: What If We Change Sample Size?#
Letโs see what happens when we use different sample sizes!
# Compare different sample sizes: 20, 40, and 200
sample_sizes <- c(20, 40, 200)
all_results <- tibble()
for (size in sample_sizes) {
results <- rep_sample_n(magical_creatures, size = size, reps = 300) %>%
group_by(replicate) %>%
summarize(proportion = sum(creature_type == "dragon") / size,
sample_size = paste("Size", size))
all_results <- bind_rows(all_results, results)
}
# Order facets small โ large and compute mean per sample size for dashed lines
all_results <- all_results %>%
mutate(sample_size = factor(sample_size, levels = c("Size 20", "Size 40", "Size 200")))
sample_means <- all_results %>%
group_by(sample_size) %>%
summarize(mean_proportion = mean(proportion), .groups = "drop")
# Create comparison plot (3x1 facets, styled like the chart above)
ggplot(all_results, aes(x = proportion, fill = sample_size)) +
geom_histogram(binwidth = 0.05, color = "white", alpha = 0.7) +
geom_vline(xintercept = true_dragon_proportion, color = "#9090ea", linetype = "solid", linewidth = 1) +
geom_vline(data = sample_means, aes(xintercept = mean_proportion), color = "#e1b086", linetype = "dashed", linewidth = 1) +
facet_wrap(~sample_size, ncol = 1) +
scale_fill_manual(values = c(
"Size 20" = "#cfe8ff", # light blue
"Size 40" = "#91d4eb", # medium blue (matches above)
"Size 200" = "#4da3d9" # darker blue
)) +
labs(title = "The Magic of Sample Size: All Distributions Compared",
x = "Sample Proportion of Dragons",
y = "Count",
subtitle = "Purple line = true proportion (0.30). Orange dashed line = mean of each sampling distribution.") +
theme_minimal() +
theme(
legend.position = "none",
text = element_text(size = 12),
plot.title = element_text(size = 16),
axis.title = element_text(size = 15),
axis.text = element_text(size = 12)
)
# Calculate summary statistics
summary_stats <- all_results %>%
group_by(sample_size) %>%
summarize(
Average_Proportion = mean(proportion),
Spread_SD = sd(proportion),
.groups = "drop"
) %>%
mutate(True_Proportion = true_dragon_proportion,
Difference = abs(Average_Proportion - True_Proportion))
print("๐ Summary of All Experiments:")
print(summary_stats)
๐ฏ Key Magical Discoveries from Spell 2!#
๐ Discovery 1: The Law of Large Numbers
All sample sizes give us averages close to the true population proportion (0.30)
Larger samples donโt change the average, but they make it more reliable!
๐ Discovery 2: Sample Size and Spread
Small samples (10): High variability - samples can be quite different from each other
Medium samples (50): Medium variability - samples are more consistent
Large samples (100): Low variability - samples are very consistent and close to truth
๐ช Discovery 3: The Sampling Distribution Shape
All sampling distributions are roughly bell-shaped (normal)
They all center around the true population value
Larger sample sizes create narrower, more precise distributions
๐ก Real-World Magic Applications#
๐ณ๏ธ Political Polls: When pollsters survey 1000 people instead of 100, their predictions are much more accurate!
๐ฎ Game Testing: Game companies test with large groups to get reliable feedback about what players really think.
๐ Restaurant Reviews: A restaurant with 500 reviews gives you a better idea of quality than one with 5 reviews!
โ๏ธ Medical Research: Doctors test medicines on large groups to be confident about the results.
๐ฎ Day 4 - Spell 3: Bootstrap Bootcamp - The Great Dragon Fire Sheep Rescue#
๐ The Story: Dragon Fire Sheep Rescue! ๐๐ฅ#
URGENT QUEST ALERT! The ancient Dragon of Mount Statistics has accidentally breathed fire across the Enchanted Meadows, where Farmer Luna (white sheep) and Farmer Obsidian (black sheep) graze their magical flocks. The fire didnโt harm the sheep, but created a massive cloud of magical smoke that mixed all the flocks together!
The two farmers are worried and need to know how many of their sheep are mixed together in the smoky field. But hereโs the problem: the smoke is so thick that shepherds can only see a small group of sheep at a time!
โก Your Challenge: Use the ancient art of Bootstrap Magic to estimate what percentage of the total flock is black sheep!
๐ Creating Our Magical Sheep Population#
Letโs create our mixed flock where 60% are black sheep and 40% are white sheep!
# Create our population of 10000 magical sheep
# 6000 black sheep (60%) and 4000 white sheep (40%)
magical_sheep <- tibble(
sheep_id = 1:10000,
sheep_color = factor(c(rep("black", 6000), rep("white", 4000)))
)
# Let's verify our true population proportion
true_black_proportion <- sum(magical_sheep$sheep_color == "black") / nrow(magical_sheep)
print(paste("๐ฏ True proportion of black sheep in population:", true_black_proportion))
# Visualize our population
sheep_summary <- magical_sheep %>%
count(sheep_color) %>%
mutate(proportion = n / sum(n))
ggplot(sheep_summary, aes(x = sheep_color, y = proportion, fill = sheep_color)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("black" = "black", "white" = "lightgray")) +
labs(title = "True Population: Mixed Sheep Flock",
x = "Sheep Color",
y = "Proportion",
subtitle = "Population Size: 1000 sheep, 60% black sheep") +
theme_minimal() +
theme(legend.position = "none")
๐ก What is Bootstrapping?#
๐ The Sheep Magic Trick! You can only see 20 sheep through the smoke, but you want to know what the whole flock looks like. Bootstrapping is like having a magic hat:
๐ฉ The Magic Hat Steps:
Put your 20 sheep in the magic hat (your original sample through the smoke)
Pull one sheep out, write down its color (black or white)
PUT IT BACK in the hat! (this is the magic part - WITH REPLACEMENT)
Repeat 20 times to create a โpretend new sampleโ
Do this magic trick hundreds of times to see all possible pretend samples!
By doing this magic trick hundreds of times, you can imagine what it would be like if you could take hundreds of real samples from the smoky field!
๐ Phase 1: Your Original Vision (Taking Our First Sample)#
You can only see 20 sheep through the smoke. Letโs take our first sample!
# Take one original sample of 20 sheep that we can see through the smoke
original_sample <- rep_sample_n(magical_sheep, size = 20)
# Count black vs white sheep in our original sample
original_count <- original_sample %>%
count(sheep_color) %>%
mutate(proportion = n / sum(n))
print("๐ Our original sample through the smoke:")
print(original_count)
ggplot(original_count, aes(x = sheep_color, y = proportion, fill = sheep_color)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("black" = "black", "white" = "lightgray")) +
labs(title = "First sample through the smoke",
x = "Sheep Color",
y = "Proportion",
subtitle = "Sample Size: 20 sheep") +
theme_minimal() +
theme(legend.position = "none")
# Calculate our initial estimate
original_black_prop <- sum(original_sample$sheep_color == "black") / 20
print(paste("๐ Our estimate of black sheep percentage:", round(original_black_prop * 100, 1), "%"))
print(paste("๐ฏ True black sheep percentage:", round(true_black_proportion * 100, 1), "%"))
๐ Phase 2: The Bootstrap Magic Ritual#
Now weโll use bootstrap magic! Weโll resample from our original sample WITH REPLACEMENT to create many bootstrap samples.
# Let's try one bootstrap resample to see how it works
# We sample WITH REPLACEMENT from our original sample
bootstrap_sample_1 <- original_sample %>%
ungroup() %>%
select(sheep_color) %>%
rep_sample_n(size = 20, replace = TRUE)
bootstrap_count <- bootstrap_sample_1 %>%
count(sheep_color) %>%
mutate(proportion = n / sum(n))
ggplot(bootstrap_count, aes(x = sheep_color, y = proportion, fill = sheep_color)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("black" = "black", "white" = "lightgray")) +
labs(title = "First bootstrap sample",
x = "Sheep Color",
y = "Proportion",
subtitle = "BootstrapSample Size: 20 sheep (with replacement)") +
theme_minimal() +
theme(legend.position = "none")
bootstrap_prop_1 <- sum(bootstrap_sample_1$sheep_color == "black") / 20
print(paste("๐ฒ Bootstrap sample 1 - black sheep proportion:", round(bootstrap_prop_1, 3)))
# Let's try another bootstrap sample
bootstrap_sample_2 <- original_sample %>%
ungroup() %>%
select(sheep_color) %>%
rep_sample_n(size = 20, replace = TRUE)
bootstrap_count <- bootstrap_sample_2 %>%
count(sheep_color) %>%
mutate(proportion = n / sum(n))
ggplot(bootstrap_count, aes(x = sheep_color, y = proportion, fill = sheep_color)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("black" = "black", "white" = "lightgray")) +
labs(title = "๐ Second bootstrap sample",
x = "Sheep Color",
y = "Proportion",
subtitle = "BootstrapSample Size: 20 sheep (with replacement)") +
theme_minimal() +
theme(legend.position = "none")
bootstrap_prop_2 <- sum(bootstrap_sample_2$sheep_color == "black") / 20
print(paste("๐ฒ Bootstrap sample 2 - black sheep proportion:", round(bootstrap_prop_2, 3)))
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
โconversion failure on '๐ Second bootstrap sample' in 'mbcsToSbcs': dot substituted for <f0>โ
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
โconversion failure on '๐ Second bootstrap sample' in 'mbcsToSbcs': dot substituted for <9f>โ
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
โconversion failure on '๐ Second bootstrap sample' in 'mbcsToSbcs': dot substituted for <90>โ
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
โconversion failure on '๐ Second bootstrap sample' in 'mbcsToSbcs': dot substituted for <91>โ
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
โconversion failure on '๐ Second bootstrap sample' in 'mbcsToSbcs': dot substituted for <f0>โ
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
โconversion failure on '๐ Second bootstrap sample' in 'mbcsToSbcs': dot substituted for <9f>โ
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
โconversion failure on '๐ Second bootstrap sample' in 'mbcsToSbcs': dot substituted for <90>โ
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
โconversion failure on '๐ Second bootstrap sample' in 'mbcsToSbcs': dot substituted for <91>โ
[1] "๐ฒ Bootstrap sample 2 - black sheep proportion: 0.65"
# And one more to see the variability
bootstrap_sample_3 <- original_sample %>%
ungroup() %>%
select(sheep_color) %>%
rep_sample_n(size = 20, replace = TRUE)
bootstrap_count <- bootstrap_sample_3 %>%
count(sheep_color) %>%
mutate(proportion = n / sum(n))
ggplot(bootstrap_count, aes(x = sheep_color, y = proportion, fill = sheep_color)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("black" = "black", "white" = "lightgray")) +
labs(title = "Third bootstrap sample",
x = "Sheep Color",
y = "Proportion",
subtitle = "BootstrapSample Size: 20 sheep (with replacement)") +
theme_minimal() +
theme(legend.position = "none")
bootstrap_prop_3 <- sum(bootstrap_sample_3$sheep_color == "black") / 20
print(paste("๐ฒ Bootstrap sample 3 - black sheep proportion:", round(bootstrap_prop_3, 3)))
๐ The Full Bootstrap Distribution: 1000 Magic Resamples!#
Now letโs create many bootstrap samples to see the full distribution of possible estimates!
# Create 1000 bootstrap samples from our original sample
original_sample_clean <- original_sample %>%
ungroup() %>%
select(sheep_color)
bootstrap_results <- original_sample_clean %>%
rep_sample_n(size = 20, replace = TRUE, reps = 1000) %>%
group_by(replicate) %>%
summarize(black_proportion = sum(sheep_color == "black") / 20,
.groups = "drop")
# Calculate statistics about our bootstrap distribution
bootstrap_mean <- mean(bootstrap_results$black_proportion)
bootstrap_sd <- sd(bootstrap_results$black_proportion)
# Plot the bootstrap distribution
ggplot(bootstrap_results, aes(x = black_proportion)) +
geom_histogram(binwidth = 0.05, fill = "#89e5ed", color = "white", alpha = 0.7) +
geom_vline(xintercept = original_black_prop, color = "#ce8ff0", linetype = "dashed", linewidth = 2) +
geom_vline(xintercept = true_black_proportion, color = "#ea9d69", linetype = "solid", linewidth = 2) +
geom_vline(xintercept = bootstrap_mean, color = "#62aa55", linetype = "longdash", linewidth = 2) +
labs(title = "Bootstrap Distribution: 1000 Magic Resamples",
x = "Bootstrap Sample Proportion of Black Sheep",
y = "Count",
subtitle = "Purple line: Our original sample estimate, \n Orange line: True proportion, \n Green line: Mean of bootstrap distribution") +
theme_minimal()
print(paste("๐ Average of bootstrap proportions:", round(bootstrap_mean, 3)))
print(paste("๐ Spread (standard deviation) of bootstrap proportions:", round(bootstrap_sd, 3)))
print(paste("๐ Our original sample proportion:", round(original_black_prop, 3)))
print(paste("๐ฏ True population proportion:", round(true_black_proportion, 3)))
๐ Phase 3: Creating Our Confidence Prophecy (Confidence Interval)#
๐ฏ Your Confidence Prophecy! After doing our magic hat trick many times, we noticed that most of our โpretend samplesโ gave us similar results. Now we can tell the farmers: โBased on all our magic hat tricks, Iโm 90% confident that the true percentage of black sheep in your mixed flock is somewhere in this range!โ
Itโs like saying โIโm pretty sure the answer is somewhere between these two numbers, but I canโt be 100% certain because I could only see a small part of the flock through the smoke.โ
# Calculate 90% confidence interval
# This means we're 90% confident the true proportion is in this range
confidence_level <- 0.90
alpha <- 1 - confidence_level
lower_percentile <- (alpha / 2) * 100
upper_percentile <- (1 - alpha / 2) * 100
confidence_interval <- quantile(bootstrap_results$black_proportion,
probs = c(alpha/2, 1 - alpha/2))
print("Our 90% Confidence Prophecy (Confidence Interval):")
print(paste("We are 90% confident that the true proportion of black sheep is between",
round(confidence_interval[1], 3), "and", round(confidence_interval[2], 3)))
# Did our confidence interval capture the true value?
captured_truth <- (true_black_proportion >= confidence_interval[1]) &
(true_black_proportion <= confidence_interval[2])
if (captured_truth) {
print("๐ SUCCESS! Our confidence interval captured the true proportion!")
} else {
print("๐
Oops! Our confidence interval missed the true proportion this time.")
}
# Visualize our confidence interval
ggplot(bootstrap_results, aes(x = black_proportion)) +
geom_histogram(binwidth = 0.05, fill = "#89e5ed", color = "white", alpha = 0.7) +
geom_vline(xintercept = original_black_prop, color = "#ce8ff0", linetype = "dashed", linewidth = 2) +
geom_vline(xintercept = true_black_proportion, color = "#ea9d69", linetype = "solid", linewidth = 2) +
geom_vline(xintercept = bootstrap_mean, color = "#62aa55", linetype = "longdash", linewidth = 2) +
geom_vline(xintercept = confidence_interval[1], color = "#9ecb97", linetype = "dotted", linewidth = 1) +
geom_vline(xintercept = confidence_interval[2], color = "#9ecb97", linetype = "dotted", linewidth = 1) +
labs(title = "๐ฎ Bootstrap Distribution: 1000 Magic Resamples with 90% Confidence Prophecy",
x = "Bootstrap Sample Proportion of Black Sheep",
y = "Count",
subtitle = "Purple line: Our original sample estimate, \n Orange line: True proportion, \n Green line: Mean of bootstrap distribution \n Green dotted lines: 90% Confidence Prophecy") +
theme_minimal()
[1] "Our 90% Confidence Prophecy (Confidence Interval):"
[1] "We are 90% confident that the true proportion of black sheep is between 0.55 and 0.85"
[1] "๐ SUCCESS! Our confidence interval captured the true proportion!"
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
โconversion failure on '๐ฎ Bootstrap Distribution: 1000 Magic Resamples with 90% Confidence Prophecy' in 'mbcsToSbcs': dot substituted for <f0>โ
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
โconversion failure on '๐ฎ Bootstrap Distribution: 1000 Magic Resamples with 90% Confidence Prophecy' in 'mbcsToSbcs': dot substituted for <9f>โ
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
โconversion failure on '๐ฎ Bootstrap Distribution: 1000 Magic Resamples with 90% Confidence Prophecy' in 'mbcsToSbcs': dot substituted for <94>โ
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
โconversion failure on '๐ฎ Bootstrap Distribution: 1000 Magic Resamples with 90% Confidence Prophecy' in 'mbcsToSbcs': dot substituted for <ae>โ
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
โconversion failure on '๐ฎ Bootstrap Distribution: 1000 Magic Resamples with 90% Confidence Prophecy' in 'mbcsToSbcs': dot substituted for <f0>โ
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
โconversion failure on '๐ฎ Bootstrap Distribution: 1000 Magic Resamples with 90% Confidence Prophecy' in 'mbcsToSbcs': dot substituted for <9f>โ
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
โconversion failure on '๐ฎ Bootstrap Distribution: 1000 Magic Resamples with 90% Confidence Prophecy' in 'mbcsToSbcs': dot substituted for <94>โ
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
โconversion failure on '๐ฎ Bootstrap Distribution: 1000 Magic Resamples with 90% Confidence Prophecy' in 'mbcsToSbcs': dot substituted for <ae>โ
๐ฒ Experiment: What If We Had Different Sample Sizes?#
Letโs see how our confidence gets better with larger samples!
# Function to create bootstrap CI for different sample sizes
create_bootstrap_ci <- function(sample_size, n_bootstrap = 500) {
# Take original sample of given size
sample_data <- rep_sample_n(magical_sheep, size = sample_size)
sample_clean <- sample_data %>% ungroup() %>% select(sheep_color)
# Create bootstrap distribution
bootstrap_dist <- sample_clean %>%
rep_sample_n(size = sample_size, replace = TRUE, reps = n_bootstrap) %>%
group_by(replicate) %>%
summarize(black_proportion = sum(sheep_color == "black") / sample_size,
.groups = "drop")
# Calculate 90% CI
ci <- quantile(bootstrap_dist$black_proportion, probs = c(0.05, 0.95))
return(list(
sample_size = sample_size,
ci_lower = ci[1],
ci_upper = ci[2],
ci_width = ci[2] - ci[1],
bootstrap_dist = bootstrap_dist
))
}
# Compare different sample sizes
sample_sizes <- c(10, 20, 50, 200, 500)
ci_results <- tibble()
for (size in sample_sizes) {
result <- create_bootstrap_ci(size)
new_row <- tibble(
sample_size = result$sample_size,
ci_lower = result$ci_lower,
ci_upper = result$ci_upper,
ci_width = result$ci_width
)
ci_results <- bind_rows(ci_results, new_row)
}
print("๐ฌ How Sample Size Affects Confidence Interval Width:")
print(ci_results)
# Visualize the effect of sample size
ggplot(ci_results, aes(x = factor(sample_size))) +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper), linewidth = 3, color = "#9ecb97") +
geom_hline(yintercept = true_black_proportion, color = "#ea9d69", linetype = "dashed") +
labs(title = "Magic of Sample Size: Confidence Intervals \nGet Narrower!",
x = "Sample Size",
y = "90% Confidence Interval",
subtitle = "Orange line: True proportion. Green lines: Confidence intervals") +
theme_minimal()
๐ฎ Day 4 - Ice Breaker Activity: Human Histogram & Statistics Discovery ๐#
This example shows you how to:
Load messy height data
Clean and fix common data problems
Calculate min, max, median, and mean
Handle missing or weird values
# ๐ Step 1: Load the data
########################################################
# Load the height data from our CSV file
height_data <- read.csv("../datasets/height.csv")
# ๐งน Step 2: Clean the data
########################################################
# only keep the second and third columns
height_data <- height_data[, c(2:3)]
# The height column has a very long name! Let's rename it
colnames(height_data)[1] <- "wizard_name"
colnames(height_data)[2] <- "height_cm"
# Let's see the updated column names
print("New column names:")
print(colnames(height_data))
# Convert height to numeric (in case it's stored as text)
height_data$height_cm <- as.numeric(height_data$height_cm)
# Check for any weird values or missing data
print("Summary of height data:")
summary(height_data$height_cm)
[1] "New column names:"
[1] "wizard_name" "height_cm"
Warning message:
โNAs introduced by coercionโ
[1] "Summary of height data:"
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
1.00e+00 1.35e+02 1.58e+02 4.00e+22 1.72e+02 1.00e+24 2
# ๐ Step 3: Data quality checks
########################################################
# Check for missing values (NA)
missing_count <- sum(is.na(height_data$height_cm))
print(paste("Number of missing heights:", missing_count))
# Check for unrealistic heights (too small or too large)
# Kids are usually between 100-200 cm tall
weird_heights <- height_data$height_cm[height_data$height_cm < 30 | height_data$height_cm > 190]
if(length(weird_heights) > 0) {
print("Warning: Found some unusual heights:")
print(weird_heights)
}
[1] "Number of missing heights: 2"
[1] "Warning: Found some unusual heights:"
[1] 5e+00 NA 6e+00 1e+00 3e+00 1e+24 NA 2e+02
# ๐ Step 4: Calculate basic statistics
########################################################
# Remove any missing values for our calculations
clean_heights <- height_data$height_cm[!is.na(height_data$height_cm)]
# Calculate the basic statistics
min_height <- min(clean_heights)
max_height <- max(clean_heights)
mean_height <- mean(clean_heights)
median_height <- median(clean_heights)
# Print our results in a nice format
print("๐ HEIGHT STATISTICS ๐")
print("========================")
print(paste("๐ Minimum height:", min_height, "cm"))
print(paste("๐ Maximum height:", max_height, "cm"))
print(paste("๐ Average height:", round(mean_height, 2), "cm"))
print(paste("๐ Median height:", median_height, "cm"))
print(paste("๐ฅ Total people measured:", length(clean_heights)))
[1] "๐ HEIGHT STATISTICS ๐"
[1] "========================"
[1] "๐ Minimum height: 1 cm"
[1] "๐ Maximum height: 1e+24 cm"
[1] "๐ Average height: 4e+22 cm"
[1] "๐ Median height: 158 cm"
[1] "๐ฅ Total people measured: 25"
๐ฎ Day 4 - Game 1: The Enchanted Halloween Neighborhood Mystery๐#
The Mgaic Council of Halloween has received reports that the Shadow Monster has been cursing neighborhoods, turning precious candies into worthless shadow stones! As young Statistical Wizards, your mission is to determine if the Enchanted Forest Neighborhood is safe for trick-or-treating.
The neighborhood has a magical protection spell - if it contains 60% or more real candies (not shadow stones), itโs safe to visit. But beware! You only have 3 minutes of magical sight to examine your sample before the spell fades.
Round 1#
## ๐งน Step 1: Set Parameters and Load Round 1 Data
library(dplyr)
library(ggplot2)
# ๐ฏ Set the maximum number of candies possible in each bag
MAX_CANDIES_PER_BAG <- 10 # Change this if needed!
# ๐ฐ Set the safety threshold (60% or more means SAFE)
SAFETY_THRESHOLD <- 60
cat("๐ Maximum candies per bag:", MAX_CANDIES_PER_BAG, "\n")
cat("๐ก๏ธ Safety threshold:", SAFETY_THRESHOLD, "% (>= this percentage means SAFE)\n")
๐ Maximum candies per bag: 10
๐ก๏ธ Safety threshold: 60 % (>= this percentage means SAFE)
# ๐ก Load Round 1 candy count data
# Make sure this CSV file is in your datasets folder!
# Round 1 data (replace with your actual file name)
round1_data <- read.csv("../datasets/candy_round1.csv")
# ๐งน Clean Round 1 data - use only columns 2, 3, 4 and clean data types
# Function to clean candy data
clean_candy_data <- function(data, round_name, candy_max = MAX_CANDIES_PER_BAG) {
# Select only columns 2, 3, 4 and rename them
cleaned <- data %>%
select(
wizard_name = 2, # Column 2: wizard name
candy_count_raw = 3, # Column 3: candy count
decision = 4 # Column 4: decision
) %>%
# Convert candy count to numeric and clean invalid entries
mutate(
candy_count = as.numeric(candy_count_raw),
round = round_name
) %>%
# Remove rows with invalid candy counts (NA, negative, or above maximum)
filter(
!is.na(candy_count),
candy_count >= 0,
candy_count <= candy_max
) %>%
# Calculate candy percentage
mutate(
candy_percentage = candy_count / candy_max * 100
) %>%
# Keep only the columns we need
select(wizard_name, candy_count, candy_percentage, decision, round)
return(cleaned)
}
# Clean Round 1 dataset
round1_clean <- clean_candy_data(round1_data, "Round 1")
cat("โจ Round 1 Cleaned Data Summary:\n")
cat("Total wizards:", nrow(round1_clean), "\n")
cat("Valid candy counts range:", min(round1_clean$candy_count), "to", max(round1_clean$candy_count), "\n")
โจ Round 1 Cleaned Data Summary:
Total wizards: 21
Valid candy counts range: 1 to 10
Sampling Distribution for Round 1#
# ๐ Create a histogram showing everyone's candy percentages from Round 1
round1_avg <- mean(round1_clean$candy_percentage)
hist_round1 <-ggplot(round1_clean, aes(x = candy_percentage)) +
geom_histogram(fill = "pink", color = "white", alpha = 0.7) +
geom_vline(xintercept = round1_avg, color = "orange", linewidth = 2) +
labs(
title = "Round 1: Sampling Distribution",
subtitle = paste("Orange line = Sample average (", round(round1_avg, 1), "%)"),
x = "Candy Percentage (%)",
y = "Number of Wizards"
) +
theme_minimal() +
theme(text = element_text(size = 16),
plot.title = element_text(size = 20),
axis.title = element_text(size = 18),
axis.text = element_text(size = 19))
print(hist_round1)
Calculate Round 1 Statistics#
# ๐ Calculate important statistics for Round 1
round1_std <- sd(round1_clean$candy_percentage)
cat("Round 1 Results:\n")
cat("Sample average:", round(round1_avg, 2), "%\n")
cat("Sample standard deviation:", round(round1_std, 2), "\n")
cat("Number of wizards:", nrow(round1_clean), "\n")
cat("Range of candy counts:", min(round1_clean$candy_count), "to", max(round1_clean$candy_count), "candies\n")
Round 1 Results:
Sample average: 54.29 %
Sample standard deviation: 21.81
Number of wizards: 21
Range of candy counts: 1 to 10 candies
Round 1 Predictions Analysis#
# ๐ Analyze the predictions made by wizards in Round 1
round1_with_predictions <- round1_clean %>%
mutate(
predicted_answer = case_when(
grepl("SAFE", decision, ignore.case = TRUE) ~ "SAFE",
grepl("CURSED", decision, ignore.case = TRUE) ~ "CURSED",
TRUE ~ "UNKNOWN"
)
)
# Count SAFE vs CURSED predictions
round1_summary <- round1_with_predictions %>%
count(predicted_answer) %>%
filter(predicted_answer %in% c("SAFE", "CURSED"))
cat("๐ Round 1 Prediction Summary:\n")
for(i in seq_len(nrow(round1_summary))) {
cat(round1_summary$predicted_answer[i], ":", round1_summary$n[i], "wizards\n")
}
# Show individual results
#cat("\n๐งโโ๏ธ Individual Wizard Results (Round 1):\n")
#for(i in seq_len(nrow(round1_with_predictions))) {
# cat("โญ", round1_with_predictions$wizard_name[i], "- Found",
# round1_with_predictions$candy_count[i], "candies (",
# round(round1_with_predictions$candy_percentage[i], 1), "%) - Predicted:",
# round1_with_predictions$predicted_answer[i], "\n")
#}
๐ Round 1 Prediction Summary:
CURSED : 10 wizards
SAFE : 11 wizards
Round 2#
Load Round 2 Data#
# ๐ก Load Round 2 candy count data
round2_data <- read.csv("../datasets/candy_round2.csv")
MAX_CANDIES_PER_BAG <- 20
# Clean Round 2 dataset using the same function
round2_clean <- clean_candy_data(round2_data, "Round 2", candy_max = MAX_CANDIES_PER_BAG)
# Remove wizards with very low candy counts for better analysis
round2_clean <- round2_clean[which(round2_clean$candy_count > 1),]
cat("\nโจ Round 2 Cleaned Data Summary:\n")
cat("Total wizards:", nrow(round2_clean), "\n")
cat("Valid candy counts range:", min(round2_clean$candy_count), "to", max(round2_clean$candy_count), "\n")
โจ Round 2 Cleaned Data Summary:
Total wizards: 22
Valid candy counts range: 4 to 14
Sampling Distribution for Round 2#
# ๐ Create a histogram showing everyone's candy percentages from Round 2
round2_avg <- mean(round2_clean$candy_percentage)
hist_round2 <- ggplot(round2_clean, aes(x = candy_percentage)) +
geom_histogram(fill = "#6cb2e0", color = "white", alpha = 0.7) +
geom_vline(xintercept = round2_avg, color = "purple", linewidth = 2) +
labs(
title = "Round 2: Sampling Distribution",
subtitle = paste("Purple line = Sample average (", round(round2_avg, 1), "%)"),
x = "Candy Percentage (%)",
y = "Number of Wizards"
) +
theme_minimal() +
theme(text = element_text(size = 16),
plot.title = element_text(size = 20),
axis.title = element_text(size = 18),
axis.text = element_text(size = 19))
print(hist_round2)
Calculate Round 2 Statistics#
# ๐ Calculate important statistics for Round 2
round2_std <- sd(round2_clean$candy_percentage)
cat("Round 2 Results:\n")
cat("Sample average:", round(round2_avg, 2), "%\n")
cat("Sample standard deviation:", round(round2_std, 2), "\n")
cat("Number of wizards:", nrow(round2_clean), "\n")
cat("Range of candy counts:", min(round2_clean$candy_count), "to", max(round2_clean$candy_count), "candies\n")
Round 2 Results:
Sample average: 46.99 %
Sample standard deviation: 12.08
Number of wizards: 22
Range of candy counts: 4 to 14 candies
Round 2 Predictions Analysis#
# ๐ Analyze the predictions made by wizards in Round 2
round2_with_predictions <- round2_clean %>%
mutate(
predicted_answer = case_when(
grepl("SAFE", decision, ignore.case = TRUE) ~ "SAFE",
grepl("CURSED", decision, ignore.case = TRUE) ~ "CURSED",
TRUE ~ "UNKNOWN"
)
)
# Count SAFE vs CURSED predictions
round2_summary <- round2_with_predictions %>%
count(predicted_answer) %>%
filter(predicted_answer %in% c("SAFE", "CURSED"))
cat("๐ Round 2 Prediction Summary:\n")
for(i in seq_len(nrow(round2_summary))) {
cat(round2_summary$predicted_answer[i], ":", round2_summary$n[i], "wizards\n")
}
# Show individual results
#cat("\n๐งโโ๏ธ Individual Wizard Results (Round 2):\n")
#for(i in seq_len(nrow(round2_with_predictions))) {
# cat("โญ", round2_with_predictions$wizard_name[i], "- Found",
# round2_with_predictions$candy_count[i], "candies (",
# round(round2_with_predictions$candy_percentage[i], 1), "%) - Predicted:",
# round2_with_predictions$predicted_answer[i], "\n")
#}
๐ Round 2 Prediction Summary:
CURSED : 15 wizards
SAFE : 7 wizards
Overlay the Distributions#
# ๐จ Combine both datasets for comparison
combined_data <- bind_rows(round1_clean, round2_clean)
# Create overlapping histograms
hist_combined <- ggplot(combined_data, aes(x = candy_percentage, fill = round)) +
geom_histogram(alpha = 0.6, position = "identity", color = "white") +
geom_vline(xintercept = round1_avg, color = "orange", linewidth = 3, linetype = "dotted") +
geom_vline(xintercept = round2_avg, color = "purple", linewidth = 3, linetype = "dotted") +
scale_fill_manual(values = c("Round 1" = "pink", "Round 2" = "#6cb2e0")) +
labs(
title = "Comparison: Round 1 vs Round 2",
subtitle = "Orange dotted = Round 1 avg, Purple dotted = Round 2 avg",
x = "Candy Percentage (%)",
y = "Number of Wizards",
fill = "Round"
) +
theme_minimal() +
theme(text = element_text(size = 16),
plot.title = element_text(size = 20),
axis.title = element_text(size = 18),
axis.text = element_text(size = 19))
print(hist_combined)
Basic Statistical Comparison#
cat("๐ COMPARISON OF BOTH ROUNDS:\n")
cat(paste(rep("=", 40), collapse = ""), "\n")
cat("๐ Sample Averages:\n")
cat("Round 1 average:", round(round1_avg, 2), "%\n")
cat("Round 2 average:", round(round2_avg, 2), "%\n")
cat("\n๐ Sample Spreads (Standard Deviation):\n")
cat("Round 1 spread:", round(round1_std, 2), "%\n")
cat("Round 2 spread:", round(round2_std, 2), "%\n")
cat("\n๐ฏ Sample Sizes:\n")
cat("Round 1:", nrow(round1_clean), "wizards\n")
cat("Round 2:", nrow(round2_clean), "wizards\n")
# Basic comparison without true value
avg_difference <- abs(round1_avg - round2_avg)
cat("\n๐ Difference between round averages:", round(avg_difference, 2), "%\n")
if(round1_std < round2_std) {
cat("๐ Round 1 had less spread (more consistent results)\n")
} else if(round2_std < round1_std) {
cat("๐ Round 2 had less spread (more consistent results)\n")
} else {
cat("๐ Both rounds had similar spread\n")
}
๐ COMPARISON OF BOTH ROUNDS:
========================================
๐ Sample Averages:
Round 1 average: 54.29 %
Round 2 average: 46.99 %
๐ Sample Spreads (Standard Deviation):
Round 1 spread: 21.81 %
Round 2 spread: 12.08 %
๐ฏ Sample Sizes:
Round 1: 21 wizards
Round 2: 22 wizards
๐ Difference between round averages: 7.29 %
๐ Round 2 had less spread (more consistent results)
The Big Reveal - True Population Parameter!#
# ๐ฏ NOW we reveal the TRUE candy percentage for the Enchanted Forest Neighborhood
TRUE_CANDY_PERCENTAGE <- 50 # Change this if needed!
# ๐ฐ Determine the correct answer based on the 60% safety threshold
CORRECT_ANSWER <- ifelse(TRUE_CANDY_PERCENTAGE >= 60, "SAFE", "CURSED")
cat("๐ฎ THE BIG REVEAL!\n")
cat(paste(rep("=", 50), collapse = ""), "\n")
cat("๐ฐ TRUE candy percentage in the neighborhood:", TRUE_CANDY_PERCENTAGE, "%\n")
cat("๐ CORRECT answer should be:", CORRECT_ANSWER, "\n")
cat("๐ก๏ธ (Remember: >= ", SAFETY_THRESHOLD, "% means SAFE, < ", SAFETY_THRESHOLD, "% means CURSED)\n")
๐ฎ THE BIG REVEAL!
==================================================
๐ฐ TRUE candy percentage in the neighborhood: 50 %
๐ CORRECT answer should be: CURSED
๐ก๏ธ (Remember: >= 60 % means SAFE, < 60 % means CURSED)
Updated Comparison with Truth Revealed#
# Create the overlay plot WITH the true value line
hist_true <- ggplot(combined_data, aes(x = candy_percentage, fill = round)) +
geom_histogram(alpha = 0.6, position = "identity", color = "white") +
geom_vline(xintercept = TRUE_CANDY_PERCENTAGE, color = "#dc4d4d", linewidth = 3, linetype = "dashed") +
geom_vline(xintercept = round1_avg, color = "orange", linewidth = 2, linetype = "dotted") +
geom_vline(xintercept = round2_avg, color = "purple", linewidth = 2, linetype = "dotted") +
scale_fill_manual(values = c("Round 1" = "pink", "Round 2" = "#6cb2e0")) +
labs(
title = "Comparison: Round 1 vs Round 2 vs TRUTH",
subtitle = paste("Red = True value (", TRUE_CANDY_PERCENTAGE, "%), Orange dotted = Round 1 avg, Purple solid = Round 2 avg"),
x = "Candy Percentage (%)",
y = "Number of Wizards",
fill = "Round"
) +
theme_minimal() +
theme(text = element_text(size = 16),
plot.title = element_text(size = 20),
axis.title = element_text(size = 18),
axis.text = element_text(size = 19))
print(hist_true)
# Calculate how close each round was to the truth
round1_error <- abs(round1_avg - TRUE_CANDY_PERCENTAGE)
round2_error <- abs(round2_avg - TRUE_CANDY_PERCENTAGE)
cat("\n๐ฎ ACCURACY ANALYSIS:\n")
cat("Round 1 average:", round(round1_avg, 2), "% (difference from truth:", round(round1_error, 2), "%)\n")
cat("Round 2 average:", round(round2_avg, 2), "% (difference from truth:", round(round2_error, 2), "%)\n")
if(round1_error < round2_error) {
winner_round <- "Round 1"
better_error <- round1_error
} else if(round2_error < round1_error) {
winner_round <- "Round 2"
better_error <- round2_error
} else {
winner_round <- "TIE"
better_error <- round1_error
}
cat("\n๐ WINNER: ", winner_round, " was closer to the true population parameter!\n")
if(winner_round != "TIE") {
cat("๐ The winning round was only", round(better_error, 2), "% away from the truth!\n")
}
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
๐ฎ ACCURACY ANALYSIS:
Round 1 average: 54.29 % (difference from truth: 4.29 %)
Round 2 average: 46.99 % (difference from truth: 3.01 %)
๐ WINNER: Round 2 was closer to the true population parameter!
๐ The winning round was only 3.01 % away from the truth!
Who Actually Got It Right?#
# Now we can determine who actually got the predictions right!
# Round 1 winners
round1_final <- round1_with_predictions %>%
mutate(got_it_right = predicted_answer == CORRECT_ANSWER)
round1_winners <- round1_final %>%
filter(got_it_right == TRUE)
#cat("๐ ROUND 1 WINNERS (Wizards who got it right):\n")
#if(nrow(round1_winners) > 0) {
# for(i in seq_len(nrow(round1_winners))) {
# cat("โญ", round1_winners$wizard_name[i], "- Found", round1_winners$candy_count[i], "candies - Predicted:", round1_winners$predicted_answer[i], "\n")
# }
#} else {
# cat("No wizards got it exactly right in Round 1!\n")
#}
# Round 2 winners
round2_final <- round2_with_predictions %>%
mutate(got_it_right = predicted_answer == CORRECT_ANSWER)
round2_winners <- round2_final %>%
filter(got_it_right == TRUE)
#cat("\n๐ ROUND 2 WINNERS (Wizards who got it right):\n")
#if(nrow(round2_winners) > 0) {
# for(i in seq_len(nrow(round2_winners))) {
# cat("โญ", round2_winners$wizard_name[i], "- Found", round2_winners$candy_count[i], "candies - Predicted:", round2_winners$predicted_answer[i], "\n")
# }
#} else {
# cat("No wizards got it exactly right in Round 2!\n")
#}
#total_winners <- nrow(round1_winners) + nrow(round2_winners)
#cat("\n๐ TOTAL PREDICTION WINNERS ACROSS BOTH ROUNDS:", total_winners, "wizards!\n")