Statistics for Data Science: The Essential Foundation

Why Statistics Matters

Statistics is the foundation of data science. Every model evaluation, A/B test, and data-driven decision relies on statistical principles. Understanding statistics helps you avoid common pitfalls and draw sound conclusions from data.

Machine learning is essentially applied statistics - regression, classification, and clustering all have deep statistical roots.

Descriptive Statistics

import pandas as pd
import numpy as np

# Descriptive statistics for the 'salary' column of an existing DataFrame `df`.
# Bind the column once so every statistic reads from the same Series.
salary = df['salary']

# Central tendency
mean = salary.mean()
median = salary.median()
mode = salary.mode()[0]  # mode() can return several values (ties); keep the first

# Spread
std = salary.std()
variance = salary.var()
range_val = salary.max() - salary.min()
iqr = salary.quantile(0.75) - salary.quantile(0.25)

# Shape
skewness = salary.skew()  # 0 = symmetric
kurtosis = salary.kurtosis()  # 0 = normal

# Summary statistics
print(df.describe())

# Correlation
correlation = salary.corr(df['experience'])
# numeric_only=True skips text/categorical columns; without it DataFrame.corr()
# raises on pandas >= 2.0 as soon as the frame contains a non-numeric column.
corr_matrix = df.corr(numeric_only=True)

Probability Distributions

from scipy import stats
import numpy as np

# Normal Distribution
normal = stats.norm(loc=0, scale=1)  # mean=0, std=1
print(normal.pdf(0))      # Probability density at x=0
print(normal.cdf(1.96))   # P(X <= 1.96) = 0.975

# Generate random samples
samples = np.random.normal(100, 15, 1000)  # mean=100, std=15

# Binomial Distribution (n trials, p probability)
binomial = stats.binom(n=10, p=0.5)
print(binomial.pmf(5))    # P(X = 5)

# Poisson Distribution (events per interval)
poisson = stats.poisson(mu=5)  # average 5 events

# Check if data is normally distributed
# Test the samples drawn above so this snippet runs on its own; substitute
# your own 1-D array to check real data.
stat, p_value = stats.normaltest(samples)
if p_value > 0.05:
    # A large p-value means normality was not rejected; it is not proof of normality.
    print("Data appears normally distributed")

Hypothesis Testing

The framework for making data-driven decisions:

1. State the null hypothesis (H0) and the alternative hypothesis (H1).
2. Choose a significance level (alpha, typically 0.05).
3. Calculate the test statistic.
4. Find the p-value.
5. Reject H0 if the p-value < alpha; otherwise, fail to reject H0.

from scipy import stats

# T-test: Compare means of two groups
group_a = [23, 25, 28, 22, 30]
group_b = [19, 21, 24, 18, 22]
# Third group so the ANOVA example below has more than two samples to compare
group_c = [26, 27, 31, 25, 29]

# Independent t-test
t_stat, p_value = stats.ttest_ind(group_a, group_b)
print(f"t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")

if p_value < 0.05:
    print("Significant difference between groups")

# Paired t-test (before/after)
# Each position in `before` and `after` is the same subject measured twice.
before = [150, 155, 160, 148, 165]
after = [145, 150, 155, 142, 158]
t_stat, p_value = stats.ttest_rel(before, after)

# Chi-square test (categorical variables)
# `observed` is a 2x2 contingency table of counts.
observed = [[50, 30], [20, 40]]
chi2, p_value, dof, expected = stats.chi2_contingency(observed)

# ANOVA (compare multiple groups)
f_stat, p_value = stats.f_oneway(group_a, group_b, group_c)

Confidence Intervals

import numpy as np
from scipy import stats

# Eight observations used for the interval estimate
data = np.array([23, 25, 28, 22, 30, 27, 24, 26])

# Point estimate and its standard error
mean = data.mean()
sem = stats.sem(data)

# Two-sided 95% t-interval with n - 1 degrees of freedom
ci = stats.t.interval(0.95, data.size - 1, loc=mean, scale=sem)
lower_bound, upper_bound = ci

print(f"Mean: {mean:.2f}")
print(f"95% CI: ({lower_bound:.2f}, {upper_bound:.2f})")

# Bootstrap confidence interval
def bootstrap_ci(data, n_bootstrap=10000, ci=0.95, seed=None):
    """Estimate a percentile-bootstrap confidence interval for the mean.

    Args:
        data: Non-empty one-dimensional sequence or array of numbers.
        n_bootstrap: Number of bootstrap resamples to draw (at least 1).
        ci: Confidence level as a fraction between 0 and 1 (e.g. 0.95).
        seed: Optional seed for a dedicated NumPy generator, for
            reproducible intervals. When None, the global ``np.random``
            state is used, so ``np.random.seed`` still controls the draws.

    Returns:
        Tuple ``(lower, upper)``: the percentile bounds of the resampled means.

    Raises:
        ValueError: If ``data`` is empty or not one-dimensional, if
            ``n_bootstrap`` is below 1, or if ``ci`` is outside [0, 1].
    """
    values = np.asarray(data)
    if values.ndim != 1 or values.size == 0:
        # Previously an empty input produced a silent NaN interval.
        raise ValueError("data must be a non-empty one-dimensional sequence")
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be at least 1")
    if not 0 <= ci <= 1:
        raise ValueError("ci must be between 0 and 1")

    sampler = np.random if seed is None else np.random.default_rng(seed)

    # Draw every resample in one call: one row per bootstrap replicate.
    # Note this allocates n_bootstrap * len(data) elements at once.
    resamples = sampler.choice(
        values, size=(n_bootstrap, values.size), replace=True
    )
    boot_means = resamples.mean(axis=1)

    lower, upper = np.percentile(
        boot_means, [(1 - ci) / 2 * 100, (1 + ci) / 2 * 100]
    )
    return lower, upper

A/B Testing

import numpy as np
from scipy import stats

# Example: Testing conversion rates
# Raw experiment counts for each arm: visitors, then conversions
control_visitors, control_conversions = 1000, 50
treatment_visitors, treatment_conversions = 1000, 65

# Conversion rate = conversions / visitors
control_rate = control_conversions / control_visitors
treatment_rate = treatment_conversions / treatment_visitors

# Relative improvement of treatment over control
relative_lift = (treatment_rate - control_rate) / control_rate

print(f"Control: {control_rate:.2%}")
print(f"Treatment: {treatment_rate:.2%}")
print(f"Lift: {relative_lift:.2%}")

# Two-proportion z-test
from statsmodels.stats.proportion import proportions_ztest

# Successes and trials per arm, ordered (treatment, control)
count = np.array((treatment_conversions, control_conversions))
nobs = np.array((treatment_visitors, control_visitors))

# One-sided test: the alternative is that the first proportion (treatment)
# is larger than the second (control)
z_stat, p_value = proportions_ztest(count=count, nobs=nobs, alternative='larger')
print(f"p-value: {p_value:.4f}")

# Required sample size calculation
from statsmodels.stats.power import NormalIndPower

# effect_size is a standardized effect size (e.g. Cohen's h for two
# proportions), not a raw difference in conversion rates.
effect_size = 0.1
power = 0.8  # 80% power
alpha = 0.05

analysis = NormalIndPower()
sample_size = analysis.solve_power(effect_size, power=power, alpha=alpha)
# Round up: truncating the fractional result would leave each group one
# observation short of the requested power.
print(f"Required sample size per group: {int(np.ceil(sample_size))}")

Bayesian Statistics

# Bayesian A/B testing
import numpy as np

def bayesian_ab_test(a_success, a_total, b_success, b_total, n_samples=100000):
    """
    Bayesian comparison of two conversion proportions.

    Each variant's rate gets a Beta posterior (the conjugate prior for a
    proportion), built from its successes + 1 and failures + 1. The
    posteriors are compared by Monte Carlo sampling.

    Returns a tuple of:
      - the fraction of draws in which B's rate exceeds A's, i.e. P(B > A)
      - the mean relative lift of B over A, (B - A) / A, across the draws
    """
    def _posterior_draws(successes, trials):
        # Beta(successes + 1, failures + 1) samples for one variant
        failures = trials - successes
        return np.random.beta(successes + 1, failures + 1, n_samples)

    # Draw A first, then B, from the shared global random state
    draws_a = _posterior_draws(a_success, a_total)
    draws_b = _posterior_draws(b_success, b_total)

    # P(B > A)
    win_rate = np.mean(draws_b > draws_a)

    # Expected lift
    relative_gain = np.mean((draws_b - draws_a) / draws_a)

    return win_rate, relative_gain


# Demo: 50/1000 conversions for A versus 65/1000 for B
prob, lift = bayesian_ab_test(a_success=50, a_total=1000, b_success=65, b_total=1000)
print(f"P(B > A): {prob:.2%}")
print(f"Expected lift: {lift:.2%}")

Common Pitfalls

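- p-hacking: Testing many hypotheses until one happens to be significant.
- Confusing correlation with causation: A correlation between two variables does not show that one causes the other.
- Small sample sizes: Estimates are noisy and tests have low power, so results may not generalize.
- Ignoring multiple comparisons: Running many tests inflates the false-positive rate; apply a correction such as Bonferroni.
- Stopping early: Wait for the planned sample size in A/B tests; stopping at the first significant result inflates false positives.
- Selection bias: Non-random samples produce misleading conclusions.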

Master Statistics with Expert Mentorship

Our Data Science program covers statistics from fundamentals to advanced topics. Build a strong foundation with hands-on practice and expert guidance.

Explore Data Science Program

Statistics for Data Science

Why Statistics Matters

Descriptive Statistics

Probability Distributions

Hypothesis Testing

Confidence Intervals

A/B Testing

Bayesian Statistics

Common Pitfalls

Master Statistics with Expert Mentorship

Related Articles

Machine Learning Fundamentals

Feature Engineering

Time Series Analysis