From a44356dc6742c8497d8ee877946d4d5f115c9197 Mon Sep 17 00:00:00 2001 From: Rostyslav Didenko Date: Wed, 20 Nov 2024 15:12:52 +0200 Subject: [PATCH 1/3] fix: updated rtf_file_to_pdf for better error handling during the 'no-metadata' combine. --- src/pdf_util.py | 89 ++++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 34 deletions(-) diff --git a/src/pdf_util.py b/src/pdf_util.py index a044413..91576cc 100644 --- a/src/pdf_util.py +++ b/src/pdf_util.py @@ -224,54 +224,75 @@ def meta_data_to_dict(meta_data_file, title_sep: str, add_popul: bool = True): return dict(zip(output_id, output_title)) - @staticmethod - def close_word_proc(proc_tuple=("word", "winword", "WINWORD", "splwow64.exe"), silent=False): + def close_word_proc(self, proc_tuple=("word", "winword", "WINWORD", "splwow64.exe"), silent=False): + """ + Check if word application or print service are running and kill them """ - Check if word application or print service are running and kill them to avoid freeze tool - :param proc_tuple: list of process to check, default values word and print service - :return: None - """ for proc in psutil.process_iter(): if any(procstr in proc.name() for procstr in proc_tuple): if not silent: - result = messagebox.askquestion(title="Word process running", - message='All Word related processes should be closed before run.' + \ - "\nClose all Word processes?") - - if result or silent: + result = messagebox.askquestion( + title="Word process running", + message='All Word related processes should be closed before run.\nClose all Word processes?' + ) + if result == 'yes': + proc.kill() + else: + # In silent mode, kill process without asking proc.kill() # TODO: TO_THINK: run with multithreads, parallelization? def rtf_file_to_pdf(self, file_name: str, input_dir: str, output_dir: str, pause_time: float) -> None: + """Convert RTF to PDF with improved process handling""" word = None - wdFormatPDF = 17 - wdDoNotSaveChanges = 0 - - word = win32com.client.gencache.EnsureDispatch('Word.Application') + max_retries = 3 + retry_count = 0 + + while retry_count < max_retries: + try: + word = win32com.client.gencache.EnsureDispatch('Word.Application') + word.Visible = False + + in_file = os.path.normpath(os.path.join(input_dir, file_name)) + output_file = os.path.splitext(file_name)[0] + out_file = os.path.normpath(os.path.join(output_dir, output_file + '.pdf')) + + if os.path.isfile(out_file): + self.gui.logger.warning(f'{file_name} already exists as PDF') + return + + doc = word.Documents.Open(in_file, False, False, True) + doc.SaveAs(out_file, FileFormat=17) # wdFormatPDF = 17 + doc.Close(SaveChanges=0) # wdDoNotSaveChanges = 0 + time.sleep(pause_time) + self.gui.logger.warning(f'{file_name} has been converted to PDF') + return + + except Exception as e: + retry_count += 1 + self.gui.logger.warning(f'Attempt {retry_count} failed for {file_name}: {str(e)}') + try: + if doc: + doc.Close(SaveChanges=0) + except: + pass - in_file = os.path.normpath(os.path.join(input_dir, file_name)) - output_file = os.path.splitext(file_name)[0] - out_file = os.path.normpath(os.path.join(output_dir, output_file + '.pdf')) + finally: + try: + if word: + word.Quit() + except: + pass - if os.path.isfile(out_file): - self.gui.logger.warning(f'{file_name} already exists as PDF') - return + # Force cleanup of any hanging processes + self.close_word_proc(silent=True) - try: - doc = word.Documents.Open(in_file, False, False, True) - doc.SaveAs(out_file, FileFormat=wdFormatPDF) - doc.Close(SaveChanges=wdDoNotSaveChanges) - time.sleep(pause_time) - self.gui.logger.warning(f'{file_name} has been converted to PDF') - except Exception as e: - self.gui.logger.error(f'ERROR: Error while converting {file_name}: {str(e)}') + if retry_count >= max_retries: + error_msg = f'Failed to convert {file_name} after {max_retries} attempts' + self.gui.logger.error(error_msg) if self.gui.final_run_var.get() == 1: - messagebox.showerror('File convert error', - f'ERROR: Error while converting {file_name}. Check metadata file and TLF file.') + messagebox.showerror('File convert error', error_msg) os.abort() - finally: - if word: - word.Quit() def convert_to_pdf(self, in_list: list, rtf_folder_dir: str, pdf_folder_dir: str) -> None: """ From df8e326e42baa47f1dccd9a9d669ca28a1050a67 Mon Sep 17 00:00:00 2001 From: Rostyslav Didenko Date: Wed, 20 Nov 2024 15:24:28 +0200 Subject: [PATCH 2/3] fix: added check if the process can be killed in the first place. --- src/pdf_util.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/pdf_util.py b/src/pdf_util.py index 91576cc..64931d5 100644 --- a/src/pdf_util.py +++ b/src/pdf_util.py @@ -236,10 +236,16 @@ def close_word_proc(self, proc_tuple=("word", "winword", "WINWORD", "splwow64.ex message='All Word related processes should be closed before run.\nClose all Word processes?' ) if result == 'yes': - proc.kill() + try: + proc.kill() + except: + pass else: # In silent mode, kill process without asking - proc.kill() + try: + proc.kill() + except: + pass # TODO: TO_THINK: run with multithreads, parallelization? def rtf_file_to_pdf(self, file_name: str, input_dir: str, output_dir: str, pause_time: float) -> None: From 35641ee06a8fdf5b2e5b6e27d271406d70aff1ac Mon Sep 17 00:00:00 2001 From: Rostyslav Didenko Date: Wed, 20 Nov 2024 17:09:16 +0200 Subject: [PATCH 3/3] fix: fixed ToC and bookmark appearance issue. --- src/pdf_compiler.py | 22 +++-- src/pdf_util.py | 233 ++++++++++++++++---------------------------- 2 files changed, 100 insertions(+), 155 deletions(-) diff --git a/src/pdf_compiler.py b/src/pdf_compiler.py index 758cf06..0601254 100644 --- a/src/pdf_compiler.py +++ b/src/pdf_compiler.py @@ -139,7 +139,7 @@ def combine_pdfs(self): self.gui.root.update() # Combine PDFs - self.util.go_combine_selected_pdf( + combined = self.util.go_combine_selected_pdf( dir=self.pathToPDF, meta_data_=metadata_path, out_name=self.gui.OUTPUT_FILENAME, @@ -147,13 +147,23 @@ def combine_pdfs(self): title_sep=self.gui.title_separator, add_popul=self.gui.add_population ) - current_progress += 1 - self.gui.pb1['value'] = current_progress - self.gui.root.update() + + if combined: + current_progress += 1 + self.gui.pb1['value'] = current_progress + self.gui.root.update() + + # Only add TOC if files were actually combined + self.add_toc() + + pdfs_count = len([f for f in os.listdir(self.pathToPDF) if f.endswith('.pdf')]) self.gui.logger.warning( - '\nINFO: Job finished! ' + str(tlfs_count) + ' files were added to ' + self.gui.OUTPUT_FILENAME) - self.gui.logger.warning('\nINFO: ' + self.gui.OUTPUT_FILENAME + ' is saved in ' + str(os.getcwd())) + f'\nINFO: Job finished! {tlfs_count} files were processed and ' + f'{pdfs_count} were added to {self.gui.OUTPUT_FILENAME}') + + if pdfs_count > 0: + self.gui.logger.warning('\nINFO: ' + self.gui.OUTPUT_FILENAME + ' is saved in ' + str(os.getcwd())) # Final progress update self.gui.pb1['value'] = total_steps diff --git a/src/pdf_util.py b/src/pdf_util.py index 64931d5..7e3b756 100644 --- a/src/pdf_util.py +++ b/src/pdf_util.py @@ -335,180 +335,115 @@ def add_bmk_to_file(self, input_dir: str, meta_data_file: str, title_sep: str, a df['Filename'] = df['OutputName'].str.replace('-', '_') df['Filename'] = df['Filename'].str.replace('.', '_') df['Filename'] = df['Filename'] + ".rtf" - df['FilenamePDF'] = input_dir + '\\' + df.Filename.str.slice(0, -4) + '.pdf' + df['FilenamePDF'] = df['Filename'].str[:-4] + '.pdf' + df['FilenamePDF'] = df['FilenamePDF'].apply(lambda x: os.path.join(input_dir, x)) + if add_popul: df['Bookmark'] = df['Title3'] + str(title_sep) + df['Title4'] + str(title_sep) + df['Title5'] else: df['Bookmark'] = df['Title3'] + str(title_sep) + df['Title4'] - file_bmk_dict = dict(zip(df.FilenamePDF, df.Bookmark)) + # Filter to only existing files + existing_files = df[df['FilenamePDF'].apply(os.path.exists)] - for file, bmk_txt in file_bmk_dict.items(): - if self.gui.final_run_var.get(): - self.gui.logger.warning("Add bookmark to file " + str(file)) - self.gui.logger.warning("Bookmark to add: " + str(bmk_txt)) - try: - # Create temporary filename - temp_file = file + ".tmp" - - # Open original document - doc = fitz.open(file) - # Create new document - new_doc = fitz.open() - # Copy pages from original - new_doc.insert_pdf(doc) - # Set TOC - new_doc.set_toc([[1, bmk_txt, 1]]) - # Save to temporary file - new_doc.save(temp_file, garbage=4, deflate=True) - new_doc.close() - doc.close() - - # Remove original and rename temp - try: - os.replace(temp_file, file) - except PermissionError: - # If direct replace fails, try alternative approach - os.remove(file) - os.rename(temp_file, file) - - except Exception as e: - self.gui.logger.error(f"Error processing file {file}: {str(e)}") - # Try to clean up temp file if it exists - if os.path.exists(temp_file): - try: - os.remove(temp_file) - except: - pass + if existing_files.empty: + self.gui.logger.warning("No PDF files found matching metadata entries") + return - else: # temp run - if os.path.exists(file): - try: - # Same process as above for existing files - temp_file = file + ".tmp" - doc = fitz.open(file) - new_doc = fitz.open() - new_doc.insert_pdf(doc) - new_doc.set_toc([[1, bmk_txt, 1]]) - new_doc.save(temp_file, garbage=4, deflate=True) - new_doc.close() - doc.close() + # Process each file individually + for _, row in existing_files.iterrows(): + file = row['FilenamePDF'] + bmk_txt = row['Bookmark'] - try: - os.replace(temp_file, file) - except PermissionError: - os.remove(file) - os.rename(temp_file, file) - - except Exception as e: - self.gui.logger.error(f"Error processing file {file}: {str(e)}") - if os.path.exists(temp_file): - try: - os.remove(temp_file) - except: - pass + self.gui.logger.warning("Add bookmark to file " + str(file)) + self.gui.logger.warning("Bookmark to add: " + str(bmk_txt)) - else: - self.gui.logger.warning("Create file: " + str(file)) - bmk_txt = str(os.path.basename(file))[:-4] + "NO SUCH FILE IN TLF's FOLDER->Re-RUN to get bookmark" - self.gui.logger.warning("Bookmark to add_: " + str(bmk_txt)) + try: + temp_file = file + ".tmp" + doc = fitz.open(file) + new_doc = fitz.open() + new_doc.insert_pdf(doc) + new_doc.set_toc([[1, bmk_txt, 1]]) + new_doc.save(temp_file, garbage=4, deflate=True) + new_doc.close() + doc.close() + + try: + os.replace(temp_file, file) + except PermissionError: + os.remove(file) + os.rename(temp_file, file) + except Exception as e: + self.gui.logger.error(f"Error processing file {file}: {str(e)}") + if os.path.exists(temp_file): try: - doc = fitz.open() - page = doc.new_page() - page.insert_text(fitz.Point(50, 100), """NO SUCH FILE IN TLF's FOLDER""", fontsize=35) - doc.set_toc([[1, bmk_txt, 1]]) - doc.save(file, garbage=4, deflate=True) - doc.close() - except Exception as e: - self.gui.logger.error(f"Error creating placeholder file {file}: {str(e)}") + os.remove(temp_file) + except: + pass def go_combine_selected_pdf(self, dir, meta_data_, out_name, title_sep: str, add_popul: bool = True, - prot_fl: bool =False): - - def fitz_combine(pdf_files, out_name_, prot_fl_=False): - with fitz.open() as result: - general_toc, tmp_toc = None, None - - for pdf in pdf_files: - with fitz.open(pdf) as mfile: - pages = len(result) - result.insert_pdf(mfile) - if not general_toc: - general_toc = mfile.get_toc(simple=True) - else: - tmp_toc = mfile.get_toc(simple=True) - for t in tmp_toc: # increase toc2 page numbers - t[2] += pages # by old len(doc1) - general_toc += tmp_toc - - # general_toc.sort() - general_toc = list(k for k, _ in itertools.groupby(general_toc)) - result.set_toc(general_toc) - if prot_fl_: - result.save(out_name_, pretty=True, garbage=4, deflate=True, encryption=4, user_pw="ewq321") - else: - result.save(out_name_, pretty=True, garbage=4, deflate=True) - - - # TODO: move to fucn (Dataframe to order list) + prot_fl: bool = False): df = pd.read_csv(meta_data_) df = df.dropna(how='all') df['Filename'] = df['OutputName'].str.replace('-', '_') df['Filename'] = df['Filename'].str.replace('.', '_') df['Filename'] = df['Filename'] + ".rtf" + df['FilenamePDF'] = df['Filename'].str[:-4] + '.pdf' + existing_files = df[df['FilenamePDF'].apply(lambda x: os.path.exists(os.path.join(dir, x)))] + + if existing_files.empty: + self.gui.logger.warning("No PDF files found to combine") + return False + if add_popul: - df['Bookmark'] = df['Title3'] + str(title_sep) + df['Title4'] + str(title_sep) + df['Title5'] + existing_files.loc[:, 'Bookmark'] = existing_files['Title3'] + str(title_sep) + existing_files[ + 'Title4'] + str(title_sep) + existing_files['Title5'] else: - df['Bookmark'] = df['Title3'] + str(title_sep) + df['Title4'] - - tmp_file_order_dict = dict(zip(df['Order'], df['Filename'])) - tmp_file_bookmark_dict = dict(zip(df['Filename'], df['Bookmark'])) - tmp_file_order_dict = {k: v[:-3] + 'pdf' for k, v in tmp_file_order_dict.items()} - tmp_file_bookmark_dict = {k[:-3] + 'pdf': v for k, v in tmp_file_bookmark_dict.items()} + existing_files.loc[:, 'Bookmark'] = existing_files['Title3'] + str(title_sep) + existing_files['Title4'] - order_dict = dict(sorted(tmp_file_order_dict.items())) + order_dict = dict(zip(existing_files['Order'], existing_files['FilenamePDF'])) pdf_files_ = tuple(os.path.join(dir, v) for k, v in dict(sorted(order_dict.items())).items()) - FILE_CONST = 400 - if len(pdf_files_) <= FILE_CONST: - fitz_combine(pdf_files=pdf_files_, out_name_=out_name, prot_fl_=prot_fl) + if pdf_files_: + # Use self.gui.CWD to get the correct working directory + out_path = os.path.join(dir, '..', out_name) + self._fitz_combine(pdf_files_, out_path, prot_fl) + self.gui.logger.warning(f'\nINFO: Job finished! {len(pdf_files_)} files were combined into {out_name}') + self.gui.logger.warning(f'\nINFO: {out_name} is saved in {os.path.dirname(out_path)}') + return True + return False + + def _fitz_combine(self, pdf_files_, output_name, prot_fl=False): + """Combine PDFs using PyMuPDF (fitz)""" + result = fitz.open() + general_toc = [] + current_page = 1 + + for pdf in pdf_files_: + with fitz.open(pdf) as mfile: + pages = len(result) + result.insert_pdf(mfile) + tmp_toc = mfile.get_toc(simple=True) + if tmp_toc: + for t in tmp_toc: + t[2] += pages + general_toc.extend(tmp_toc) + + if general_toc: + result.set_toc(general_toc) + + if self.gui.pas_check_var.get() and self.gui.entry_var5.get(): + result.save(output_name, + encryption=fitz.PDF_ENCRYPT_AES_256, + owner_pw=self.gui.entry_var5.get(), + garbage=4, + deflate=True) else: - a_ = math.ceil(int(len(pdf_files_))/FILE_CONST) - a_view = order_dict.items() - a_list = list(a_view) - st = 0 - end = FILE_CONST - dct_lst = [] - for i in range(0, a_): - dct_lst.append(f'a_{i}') - exec("a_{} = a_list[st:end]".format(i)) - st = end - end += FILE_CONST - - - for elem in dct_lst: - dict_1 = dict() - exec("""for num, f_name in {0}: - dict_1.setdefault(num, []).append(f_name)""".format(elem)) - for key, value in dict_1.items(): - dict_1[key] = value[0] - - - - pdf_files_ = tuple(os.path.join(dir, v) for k, v in dict(sorted(dict_1.items())).items()) - fitz_combine(pdf_files=pdf_files_, out_name_=elem+".pdf", prot_fl_=prot_fl) - - dct_lst = [elem+'.pdf' for elem in dct_lst] - fitz_combine(pdf_files=dct_lst, out_name_=out_name, prot_fl_=prot_fl) - for elem in dct_lst: - try: - os.remove(elem) - except Exception as e: - print(e) - - + result.save(output_name, garbage=4, deflate=True) + result.close() + return True class ProgressHandler(logging.Handler):