import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Apply seaborn's "whitegrid" theme to every figure drawn below.
# set_theme() is the preferred entry point; sns.set() is only a legacy alias
# for it that seaborn's documentation says may be removed in the future.
sns.set_theme(style="whitegrid")

from sklearn.datasets import load_iris

# Load the iris bunch and turn its feature matrix into a DataFrame whose
# columns carry the bunch's feature names; the bunch's target array is
# appended as a trailing "species" column.
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(
    species=data.target
)

# Preview the first rows (rendered as the cell output in a notebook).
df.head()

# Central tendency of the sepal-length column.
sepal_len = df["sepal length (cm)"]

mean_value = sepal_len.mean()
median_value = sepal_len.median()
# mode() returns every most-frequent value; keep only the first one.
mode_value = sepal_len.mode().iloc[0]

(mean_value, median_value, mode_value)

(np.float64(5.843333333333334), 5.8, np.float64(5.0))

# Histogram (15 bins, with a KDE overlay) of sepal length, annotated with
# dashed vertical lines at the mean, median and mode computed earlier.
plt.figure(figsize=(8, 5))
ax = sns.histplot(df["sepal length (cm)"], kde=True, bins=15)

for stat_label, stat_value, line_color in (
    ("Mean", mean_value, "red"),
    ("Median", median_value, "green"),
    ("Mode", mode_value, "blue"),
):
    ax.axvline(
        stat_value,
        color=line_color,
        linestyle="--",
        label=f"{stat_label} ({stat_value:.2f})",
    )

ax.set_title("Mean, Median, and Mode")
ax.legend()
plt.show()

# Spread of the sepal-length column: variance, standard deviation and the
# interquartile range (75th percentile minus 25th percentile).
sepal_len = df["sepal length (cm)"]

variance = sepal_len.var()
std_dev = sepal_len.std()

# Ask for both quartiles in a single percentile call.
q1, q3 = np.percentile(sepal_len, [25, 75])
iqr = q3 - q1

(variance, std_dev, iqr)

(0.6856935123042505, 0.8280661279778629, np.float64(1.3000000000000007))

# Shape of the sepal-length distribution: asymmetry (skewness) and tail
# weight (kurtosis), both taken straight from pandas.
sepal_len = df["sepal length (cm)"]

skewness = sepal_len.skew()
kurtosis = sepal_len.kurt()  # kurt() is the same method as kurtosis()

(skewness, kurtosis)

(np.float64(0.3149109566369728), np.float64(-0.5520640413156395))

# One-sample t-test of sepal length against a hypothesised mean of 5.8.
one_sample = stats.ttest_1samp(df["sepal length (cm)"], popmean=5.8)
t_stat, p_value = one_sample.statistic, one_sample.pvalue

(t_stat, p_value)

(np.float64(0.6409183514112012), np.float64(0.5225602746220779))

# Compare the p-value with the 5% significance level and report the verdict.
alpha = 0.05
verdict = (
    "Reject H₀: Mean is significantly different"
    if p_value < alpha
    else "Fail to reject H₀"
)
print(verdict)

Fail to reject H₀

# Independent two-sample t-test on sepal length between the rows whose
# species code is 0 (setosa) and those whose code is 1 (versicolor).
species_code = df["species"]
setosa, versicolor = (
    df.loc[species_code == code, "sepal length (cm)"] for code in (0, 1)
)

# equal_var=True is SciPy's default (pooled-variance test); spelled out here
# so the assumption is visible.
two_sample = stats.ttest_ind(setosa, versicolor, equal_var=True)
t_stat, p_value = two_sample.statistic, two_sample.pvalue

(t_stat, p_value)

(np.float64(-10.52098626754911), np.float64(8.985235037487079e-18))

# One-way ANOVA on sepal length across the three species codes (0, 1, 2).
species_code = df["species"]
setosa, versicolor, virginica = (
    df.loc[species_code == code, "sepal length (cm)"] for code in (0, 1, 2)
)

anova = stats.f_oneway(setosa, versicolor, virginica)
f_stat, p_val = anova.statistic, anova.pvalue

(f_stat, p_val)

(np.float64(119.26450218450468), np.float64(1.6696691907693826e-31))

# Box plot of sepal length, one box per value of the "species" column.
plt.figure(figsize=(8, 5))
ax = sns.boxplot(data=df, x="species", y="sepal length (cm)")
ax.set_title("Sepal Length Distribution by Species")
plt.show()

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

Analysis Type	Method Used
Central tendency	Mean, Median, Mode
Dispersion	Variance, Std Dev, IQR
Distribution	Skewness, Kurtosis
Hypothesis testing	t-test, ANOVA
Visualization	Histogram, Boxplot

📘 Project Overview: Statistical Analysis Using Python

Objective of the Project

🔍 What This Project Covers

1️⃣ Exploratory Data Analysis (EDA)

2️⃣ Measures of Central Tendency

3️⃣ Measures of Dispersion

4️⃣ Understanding Data Distribution

5️⃣ Hypothesis Testing

a) One-sample t-test

b) Independent two-sample t-test

c) One-way ANOVA

6️⃣ Visualization for Statistical Insight

7️⃣ Why This Project Matters

8️⃣ Learning Outcomes

📌 1. Import Required Libraries

📌 2. Load and Explore Dataset

📌 3. Measures of Central Tendency

Visualization

📌 4. Measures of Variability

📌 5. Distribution Shape

📌 6. Hypothesis Testing (One-Sample t-Test)

📌 7. Independent t-test (Two Groups)

📌 8. One-Way ANOVA

📌 9. Visualization: Boxplot Comparison

📌 10. Summary Table

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2