-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
104 lines (81 loc) · 3.32 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, zscore
# Load the dataset
df = pd.read_csv('insurance.csv')
# Exploratory Data Analysis
# Check the first few rows of the dataset
print(df.head())
# Get information about the dataset
print(df.info())
# Descriptive statistics of numerical columns
print(df.describe())
# Count the number of missing values in each column
print(df.isnull().sum())
# Identify unique values in categorical columns
categorical_cols = ['sex', 'smoker', 'region']
for col in categorical_cols:
unique_values = df[col].unique()
print(f'Unique values in {col}: {unique_values}')
# Data Preprocessing
# Convert categorical variables to numerical using one-hot encoding
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'])
# Split the data into features and target variable
X = df.drop('charges', axis=1)
y = df['charges']
# Data Visualization
# Plot the distribution of charges
plt.figure(figsize=(8, 6))
sns.histplot(data=df, x='charges', kde=True)
plt.title('Distribution of Charges')
plt.show()
# Correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()
# Scatter plot of age vs. charges
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='age', y='charges', hue='smoker_yes')
plt.title('Age vs. Charges')
plt.show()
# Box plot of smoker vs. charges
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x='smoker_yes', y='charges')
plt.title('Smoker vs. Charges')
plt.show()
# Pair plot for multiple variables
sns.pairplot(df, vars=['age', 'bmi', 'children', 'charges'], hue='smoker_yes')
plt.title('Pair plot')
plt.show()
# Statistical tests and relationships
# Calculate average charges for smokers and non-smokers
avg_charges_smoker = df[df['smoker_yes'] == 1]['charges'].mean()
avg_charges_non_smoker = df[df['smoker_yes'] == 0]['charges'].mean()
print(f'Average charges for smokers: {avg_charges_smoker:.2f}')
print(f'Average charges for non-smokers: {avg_charges_non_smoker:.2f}')
# Perform t-test to compare charges for smokers and non-smokers
smoker_charges = df[df['smoker_yes'] == 1]['charges']
non_smoker_charges = df[df['smoker_yes'] == 0]['charges']
t_statistic, p_value = ttest_ind(smoker_charges, non_smoker_charges)
print(f'T-test: t-statistic = {t_statistic:.2f}, p-value = {p_value:.4f}')
# Calculate average charges by number of children
avg_charges_by_children = df.groupby('children')['charges'].mean()
print('Average charges by number of children:')
print(avg_charges_by_children)
# Calculate correlation coefficients
correlation_matrix = df.corr()
correlation_with_charges = correlation_matrix['charges'].sort_values(ascending=False)
print('Correlation with charges:')
print(correlation_with_charges)
# Identify outliers using z-scores
df['charges_zscore'] = zscore(df['charges'])
outliers = df[df['charges_zscore'].abs() > 3]
print('Outliers:')
print(outliers)
# Identify the age range with the highest charges
age_ranges = pd.cut(df['age'], bins=[0, 18, 35, 50, 65, 100])
charges_by_age_range = df.groupby(age_ranges)['charges'].mean()
highest_charges_age_range = charges_by_age_range.idxmax()
print('Age range with the highest charges:', highest_charges_age_range)