# eda (1).py

# -*- coding: utf-8 -*-
"""EDA.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1gjd-iDiSr0vCHaL0ox4emNbWFMIdhVbo

EDA
"""

# Step 1: Import Python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Step 2: Reading dataset
df = pd.read_csv('/content/jobs_in_data.csv')
# Step 3: Data reduction -- drop the two columns this quick pass does not use
df = df.drop(columns=['job_category', 'salary'])
# Step 4/5: Feature engineering -- USD salary multiplied by the work year
df['new_feature1'] = df['salary_in_usd'] * df['work_year']
# Step 6: Data cleaning/wrangling
df = df.dropna()           # remove rows that contain any missing value
df = df.drop_duplicates()  # remove fully duplicated rows
# Keep strictly positive salaries only. The .copy() detaches the filtered
# frame from its parent so later column assignments on `df` do not raise
# SettingWithCopyWarning.
df = df[df['salary_in_usd'] > 0].copy()
# EDA: histogram of the USD salary
sns.histplot(data=df, x='salary_in_usd')
plt.show()
# Statistic summary as produced by DataFrame.describe()
summary = df.describe()
print(summary)
# Step 9: EDA univariate analysis -- box plot of the USD salary
sns.boxplot(data=df, x='salary_in_usd')
plt.show()
# Step 10: Data transformation
# Standardize 'salary_in_usd' in place with scikit-learn's StandardScaler.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
salary_frame = df[['salary_in_usd']]  # double brackets: the scaler expects a 2-D input
df['salary_in_usd'] = scaler.fit(salary_frame).transform(salary_frame)
# Step 11: EDA bivariate analysis -- (scaled) USD salary against work year
sns.scatterplot(data=df, x='salary_in_usd', y='work_year')
plt.show()
# Step 12: EDA multivariate analysis -- pairwise plots over the frame
sns.pairplot(data=df)
plt.show()
# Step 13: Impute missing values (if needed)
# Fill any missing 'salary_in_usd' entries with the column mean.
# NOTE(review): rows with missing values were already removed by the earlier
# df.dropna(), so this is expected to find nothing to fill -- confirm whether
# it is still wanted.
salary_mean = df['salary_in_usd'].mean()
df['salary_in_usd'] = df['salary_in_usd'].fillna(salary_mean)

"""Step 1: Import Python Libraries"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Suppress every warning for the remainder of the run
import warnings
warnings.filterwarnings(action='ignore')

"""Step 2: Reading Dataset"""

# Load the jobs dataset into the frame used by all following steps
DATASET_PATH = '/content/jobs_in_data.csv'
data = pd.read_csv(DATASET_PATH)

"""Analayzing Data"""

# Quick structural inspection of the loaded frame.
# A plain script (unlike a notebook cell) discards the value of a bare
# expression, so every result is printed explicitly.
print(data.head())  # first rows

print(data.tail())  # last rows

data.info()  # prints dtypes, non-null counts and memory usage by itself

print(data.nunique())  # number of distinct values per column (this is not a duplicate-row check)

missing_counts = data.isnull().sum()  # missing values per column
print(missing_counts)

print(missing_counts / len(data) * 100)  # missing values per column, in percent

"""Step 3: Data Reduction"""

# Drop the 'work_year' column, then print the structure of what remains
data = data.drop(columns=['work_year'])
data.info()

"""Step 5: Creating Features"""

from datetime import date

# Approximate number of years of experience for each 'experience_level' label.
# NOTE(review): the keys below are assumed spellings -- confirm them against
# data['experience_level'].unique(); any label missing here maps to NaN.
level_mapping = {'Entry Level': 0, 'Mid Level': 5, 'Senior Level': 10}

# Store the mapped years directly. Subtracting them from the current calendar
# year would produce a year (e.g. 2019), not a number of years of experience.
data['Years_Experience'] = data['experience_level'].map(level_mapping)

# Surface labels that the mapping does not cover instead of silently leaving
# NaN behind (printed rather than warned, so a warnings filter cannot hide it).
unmapped_mask = data['Years_Experience'].isna() & data['experience_level'].notna()
unmapped_levels = data.loc[unmapped_mask, 'experience_level'].unique().tolist()
if unmapped_levels:
    print('experience_level labels without a Years_Experience mapping:', unmapped_levels)

print(data.head())

# Split 'job_title' at its first '-' into a 'Department' and a 'Role' part.
# `n` and `expand` are passed by keyword: since pandas 2.0 every argument of
# str.split after the pattern is keyword-only, so the positional form
# raises TypeError there.
split_data = data['job_title'].str.split('-', n=1, expand=True)

# expand=True yields two columns only when at least one title contains '-'
if split_data.shape[1] == 2:
    data[['Department', 'Role']] = split_data
else:
    # No title could be split: keep the whole title as the department
    data['Department'] = data['job_title']
    data['Role'] = None

# Printed explicitly: a bare expression shows nothing when run as a script
print(data[['job_title', 'Department', 'Role']])

"""Step 6: Data Cleaning/Wrangling"""

# Distinct departments and number of distinct roles before the clean-up
print(data.Department.unique())
print(data.Role.nunique())

# Assign the result back instead of calling replace(..., inplace=True) on a
# column selection: the in-place form acts on an intermediate object and does
# not reliably update `data` (it stops working under pandas copy-on-write).
# The replacement value carries no leading space, so it merges with an
# existing "Data Scientist" category instead of creating a separate one.
# NOTE(review): the two targets look as if they might be swapped
# (ML engineer -> data scientist and vice versa) -- confirm the intent.
data["Department"] = data["Department"].replace({
    "Staff Machine Learning Engineer": "Data Scientist",
    "Staff Data Scientist": "Machine Learning Engineer",
})

# Same figures after the clean-up
print(data.Department.unique())
print(data.Role.nunique())

"""Step 8: Statistics Summary"""

# Summary statistics, transposed so that each row is one column of `data`.
# Printed explicitly: a bare expression shows nothing when run as a script.
print(data.describe().T)

# Same summary over every column regardless of dtype
print(data.describe(include='all').T)

# Separate numerical and categorical variables for easy analysis
cat_cols = data.select_dtypes(include=['object']).columns
num_cols = data.select_dtypes(include=np.number).columns.tolist()
print("Categorical Variables:")
print(cat_cols)
print("Numerical Variables:")
print(num_cols)

"""Step 9: EDA Univariate Analysis"""

import numpy as np

# For every numeric column: print its name and skewness, then draw a histogram
# and a box plot side by side. Columns whose skewness is NaN or that hold
# fewer than two distinct values are reported and not plotted.
for column_name in num_cols:
    print(column_name)
    series = data[column_name]
    skewness = series.skew()

    if np.isnan(skewness) or series.nunique() < 2:
        print('Skew : No meaningful skewness due to constant values')
        continue

    print('Skew :', round(skewness, 2))
    plt.figure(figsize=(15, 4))

    # Left panel: histogram of the raw values
    hist_ax = plt.subplot(1, 2, 1)
    series.hist(grid=False, ax=hist_ax)
    hist_ax.set_ylabel('count')

    # Right panel: box plot of the non-missing values
    box_ax = plt.subplot(1, 2, 2)
    sns.boxplot(x=series.dropna(), ax=box_ax)

    plt.show()

import seaborn as sns
import matplotlib.pyplot as plt

# Categorical variables of the job dataset to visualise
categorical_variables = ['job_title', 'job_category', 'salary_currency', 'employee_residence',
                           'experience_level', 'employment_type', 'work_setting',
                           'company_location', 'company_size',]

# Size the grid from the number of variables so that no row of empty panels
# is left over (a fixed 4x3 grid leaves three blank panels for 9 variables).
n_cols = 3
n_rows = -(-len(categorical_variables) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4.5 * n_rows), squeeze=False)
fig.suptitle('Bar plot for all categorical variables in the dataset')

# Flatten the axes for easier iteration
axes = axes.flatten()

# One count plot per categorical variable, bars ordered by frequency
for ax, cat_var in zip(axes, categorical_variables):
    sns.countplot(ax=ax, x=cat_var, data=data, color='blue',
                  order=data[cat_var].value_counts().index)
    # Tilt only the x tick labels, and only when there are many categories
    ax.tick_params(axis='x', labelrotation=45 if len(data[cat_var].unique()) > 5 else 0)

# Remove panels in the last row that received no variable
for unused_ax in axes[len(categorical_variables):]:
    fig.delaxes(unused_ax)

plt.tight_layout(rect=[0, 0, 1, 0.96])  # leave room at the top for the figure title
plt.show()

"""Step 10: Data Transformation"""

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Log-transform both salary columns with log1p and plot the resulting
# distributions side by side.
# Each entry: (source column, new log column, panel title)
log_specs = [
    ('salary_in_usd', 'salary_in_usd_log', 'Log Transformed salary_in_usd'),
    ('salary', 'salary_log', 'Log Transformed Salary'),
]

# Add the log-transformed columns to the frame
for source_col, log_col, _title in log_specs:
    data[log_col] = np.log1p(data[source_col])

# One histogram (with KDE overlay) per transformed column
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
for panel, (_source, log_col, title) in zip(axes, log_specs):
    sns.histplot(data[log_col], ax=panel, kde=True)
    panel.set_title(title)
    panel.set_xlabel(log_col)

plt.show()

"""Step 12: EDA Bivariate Analysis"""

# Pairwise relationships between the remaining columns (raw salary columns
# excluded). pairplot is a figure-level function that creates its own figure,
# so no plt.figure() call precedes it -- one would only leave an extra, empty
# figure behind.
sns.pairplot(data=data.drop(['salary_in_usd', 'salary'], axis=1))
plt.show()

# Bar plots relating each categorical variable to the mean of 'salary_log'.
# Each entry: (grouping column, optional cap on the number of bars shown).
bar_specs = [
    ('company_location', None),
    ('job_category', None),
    ('salary_currency', None),
    ('employee_residence', None),
    ('experience_level', 10),
    ('employment_type', 10),
    ('work_setting', None),
    ('company_size', None),
]

fig, axarr = plt.subplots(4, 2, figsize=(12, 18))
# axarr.flat walks the grid row by row, matching the order of bar_specs
for ax, (column, top_n) in zip(axarr.flat, bar_specs):
    mean_log_salary = data.groupby(column)['salary_log'].mean().sort_values(ascending=False)
    if top_n is not None:
        mean_log_salary = mean_log_salary.head(top_n)
    mean_log_salary.plot.bar(ax=ax, fontsize=12)
    ax.set_title(f"{column} Vs salary", fontsize=18)

plt.subplots_adjust(hspace=1.0, wspace=.5)
sns.despine()
# Render this figure here, as every other plotting step does, rather than
# leaving it pending until a later plt.show() call.
plt.show()

"""Step 13: EDA Multivariate Analysis

"""

# Heat map of the pairwise correlations between the numeric variables.
plt.figure(figsize=(12, 7))
# Restrict to numeric columns before correlating: DataFrame.corr() raises on
# string columns in pandas >= 2.0, where non-numeric columns are no longer
# dropped silently.
numeric_features = data.drop(['salary_in_usd', 'salary'], axis=1).select_dtypes(include=np.number)
sns.heatmap(numeric_features.corr(), annot=True, vmin=-1, vmax=1)
plt.show()