From 79690109f5006bfb144cf599ada1d6455e9d2fcd Mon Sep 17 00:00:00 2001 From: adisve Date: Wed, 10 Apr 2024 17:48:07 +0200 Subject: [PATCH] Move spinner init in load_data_to_hdfs.py to avoid error if /data is not populated --- scripts/spark/setup/load_data_to_hdfs.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/spark/setup/load_data_to_hdfs.py b/scripts/spark/setup/load_data_to_hdfs.py index e32b97b..1483331 100644 --- a/scripts/spark/setup/load_data_to_hdfs.py +++ b/scripts/spark/setup/load_data_to_hdfs.py @@ -70,13 +70,14 @@ def get_schema(self): return self.parse_schema_file() def transfer_data(self): + spinner = Halo(text=f"Reading and writing data from /data/output.csv to {self.hdfs_path}") + spinner.start() try: self.start_spark_session() schema = self.get_schema() logging.info(f"Reading and writing data from /data/output.csv to {self.hdfs_path}") - spinner = Halo(text=f"Reading and writing data from /data/output.csv to {self.hdfs_path}") - spinner.start() + df = (self.spark.read.option("header", "true") .option("mode", "DROPMALFORMED") .option("overwrite", "true")