Skip to content

Commit

Permalink
Data Cleaning
Browse files Browse the repository at this point in the history
  • Loading branch information
manvith1604 committed Jun 7, 2024
1 parent f962866 commit 0925799
Show file tree
Hide file tree
Showing 8 changed files with 109 additions and 4 deletions.
Binary file modified data/processed_data/Transformed.xlsx
Binary file not shown.
Binary file added data/processed_data/filtered_data.pkl
Binary file not shown.
Binary file added data/processed_data/filtered_data.xlsx
Binary file not shown.
29 changes: 29 additions & 0 deletions data/raw_data/countries_to_drop.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
Country
Andorra
Antigua and Barbuda
Aruba
Ecuador
Iraq
Kiribati
Kosovo
Macao SAR
Marshall Islands
Micronesia
Nauru
Palau
Papua New Guinea
Puerto Rico
Qatar
Republic of Congo
San Marino
Somalia
South Sudan
St. Kitts and Nevis
St. Lucia
St. Vincent and the Grenadines
Timor-Leste
Tuvalu
Uruguay
Venezuela
West Bank and Gaza
Zimbabwe
3 changes: 0 additions & 3 deletions src/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@

# Initialize logger
file_path = os.path.join(PROJECT_DIR, 'utilities', 'log_config.json')
if not os.path.exists(file_path):
file_path = os.path.join(PROJECT_DIR, 'utilities', 'log_config.json')

my_logger = setup_logging(file_path)
my_logger.set_logger("main_logger")

Expand Down
3 changes: 3 additions & 0 deletions src/data_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
import os
from data_loader import import_data
from data_cleaner import process_data
from filter_data import filter_data
from transform import transform_data

# Repository root: two levels up from this file (src/ -> project root)
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Defining the default paths for the Excel and pickle files
DEFAULT_EXCEL_PATH = os.path.join(PROJECT_DIR, 'data', 'raw_data', 'IMF_WEO_Data.xlsx')
DEFAULT_PICKLE_PATH = os.path.join(PROJECT_DIR, 'data', 'processed_data', 'raw_data.pkl')
# CSV with a 'Country' column listing countries to exclude in the filter step
DEFAULT_COUNTRIES_TO_DROP_PATH = os.path.join(PROJECT_DIR, 'data', 'raw_data', 'countries_to_drop.csv')

if __name__ == "__main__":
"""
Expand All @@ -20,4 +22,5 @@
LOAD_DATA = import_data(DEFAULT_EXCEL_PATH, DEFAULT_PICKLE_PATH)
CLEAN_DATA = process_data(LOAD_DATA)
TRANSFORM_DATA = transform_data(CLEAN_DATA)
FILTER_DATA = filter_data(TRANSFORM_DATA, DEFAULT_COUNTRIES_TO_DROP_PATH)

76 changes: 76 additions & 0 deletions src/filter_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import os
import pandas as pd
from utilities.logger import setup_logging

# Define project directory
# (two levels up from this file: src/ -> project root)
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Initialize logger
# Config lives at <project>/utilities/log_config.json; the shared
# "main_logger" channel is reused by the other pipeline scripts.
file_path = os.path.join(PROJECT_DIR, 'utilities', 'log_config.json')
my_logger = setup_logging(file_path)
my_logger.set_logger("main_logger")

def filter_data(pickle_path, countries_to_drop_path):
    """Filter the transformed dataset and save it as pickle and Excel files.

    Drops a configurable list of countries, keeps only the years 1994-2029
    (inclusive), and removes "percent of GDP" style columns plus a fixed
    list of series not needed downstream.

    Args:
        pickle_path: Path to the pickle file produced by the transform step.
            The DataFrame is expected to have 'Country' and 'Year' columns.
        countries_to_drop_path: Path to a CSV file with a 'Country' column
            listing the countries to exclude.

    Returns:
        str: Path of the pickle file containing the filtered DataFrame.

    Raises:
        FileNotFoundError: If either input file does not exist.
    """
    my_logger.write('info', f"Starting to filter data from {pickle_path}")

    # Load the transformed data from pickle file
    try:
        df = pd.read_pickle(pickle_path)
        my_logger.write('info', "Data successfully loaded from the transformed pickle file.")
    except FileNotFoundError as e:
        # Chain the original error and keep the offending path in the message
        my_logger.write('error', f"The transformed pickle file does not exist: {pickle_path}")
        raise FileNotFoundError(f"The transformed pickle file does not exist: {pickle_path}") from e
    except Exception as e:
        my_logger.write('error', f"Failed to load data from the transformed pickle file: {e}")
        raise

    # Load the list of countries to drop from a CSV file
    try:
        countries_to_drop = pd.read_csv(countries_to_drop_path)['Country'].tolist()
        my_logger.write('info', "Countries to drop successfully loaded.")
    except FileNotFoundError as e:
        my_logger.write('error', f"The countries to drop CSV file does not exist: {countries_to_drop_path}")
        raise FileNotFoundError(f"The countries to drop CSV file does not exist: {countries_to_drop_path}") from e
    except Exception as e:
        my_logger.write('error', f"Failed to load the countries to drop CSV file: {e}")
        raise

    # Ensure 'Year' is numeric; unparseable values become NaN and are
    # excluded by the range filter below.
    df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

    # Drop the specified countries
    df = df[~df['Country'].isin(countries_to_drop)]

    # Filter the DataFrame for years between 1994 and 2029 (inclusive)
    filtered_df = df[(df['Year'] >= 1994) & (df['Year'] <= 2029)]

    # Drop "percent of GDP" columns (matched case-insensitively) together
    # with a fixed list of additional unused series in a single pass;
    # errors='ignore' tolerates columns that are already absent.
    columns_to_drop = [col for col in filtered_df.columns if "percent of gdp" in col.lower()]
    columns_to_drop += [
        'Output gap in percent of potential GDP - Percent of potential GDP (Units)',
        'Employment - Persons (Millions)',
        'General government net debt - National currency (Billions)',
        'General government structural balance - National currency (Billions)',
        'General government structural balance - Percent of potential GDP (Units)'
    ]
    filtered_df = filtered_df.drop(columns=columns_to_drop, errors='ignore')

    # Save the filtered DataFrame to a new Pickle and Excel file
    filtered_pickle_path = os.path.join(PROJECT_DIR, 'data', 'processed_data', 'filtered_data.pkl')
    try:
        filtered_df.to_pickle(filtered_pickle_path)
        my_logger.write('info', f"Filtered data successfully saved to {filtered_pickle_path}")

        # Excel copy is for human inspection; the pickle is the pipeline artifact
        filtered_excel_path = os.path.join(PROJECT_DIR, 'data', 'processed_data', 'filtered_data.xlsx')
        filtered_df.to_excel(filtered_excel_path, index=False)
        my_logger.write('info', f"Filtered data successfully saved as an Excel file at {filtered_excel_path}.")
    except Exception as e:
        my_logger.write('error', f"Failed to save the filtered data: {e}")
        raise

    return filtered_pickle_path
2 changes: 1 addition & 1 deletion src/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def transform_data(pickle_path):
my_logger.write('error', f"Failed to save transformed data: {e}")
raise

return transformed_excel_path
return transformed_pickle_path

def melt_dataframe(df):
"""Function to melt the DataFrame"""
Expand Down

0 comments on commit 0925799

Please sign in to comment.