# Source code for iMaT.src.tokenization.utils

"""
Module: tokenization.utils.py
=============================

This module provides various utility functions that assist in the tokenization of MIDI files and handling of data.

Functions
---------
- `save_data_to_new_csv_file`: Saves a DataFrame to a new CSV file in a timestamped directory.
- `select_csv_file_2d_token_representation`: Opens a dialog for the user to select a CSV file.
- `display_tokenizable_files_in_folder`: Prints all the tokenizable files in a specific folder.
- `get_tokenizable_files_in_folder`: Retrieves all tokenizable MIDI files in a specific folder.
- `display_success_rate`: Shows the success rate of the tokenization process and any errors.
- `combine_csv_files_in_directory`: Merges all CSV files in a specific directory into one CSV file.
- `create_log_entry`: Adds a new entry to an existing Excel file, or creates a new one if it doesn't exist.

Notes
-----
These functions are used throughout the package to facilitate the tokenization process for MIDI files and ensure correct data handling, saving, and logging.
Please refer to the individual function docstrings for more detailed descriptions and examples of usage.
"""
import glob
import os
import tkinter as tk
from datetime import datetime
from tkinter import filedialog

import pandas as pd
from openpyxl import Workbook, load_workbook
from tqdm import tqdm

from iMaT.src.cli.menu_constructors import display_menu_print_results, display_menu_print_textblock
from iMaT.src.utils.error_handling import handle_error


def save_data_to_new_csv_file(df, file_name, identifier):
    """
    Saves a DataFrame to a new CSV file inside a timestamped directory.

    The output directory is created next to `file_name` and is named
    `identifier` followed by the current timestamp (format ``%Y%m%d_%H%M%S``).
    The CSV file written into it is named `identifier` followed by the base
    name of `file_name`. The DataFrame index is not written.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to be saved.
    file_name : str
        Path of the original file. Its directory determines where the new
        directory is created and its base name is reused for the new file.
    identifier : str
        Prefix used for both the new directory name and the new file name.

    Returns
    -------
    str
        The path to the new CSV file.

    Notes
    -----
    Any exception raised while creating the directory or writing the file is
    caught and forwarded to `handle_error` instead of propagating directly.
    """
    try:
        source_dir = os.path.dirname(file_name)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = os.path.join(source_dir, identifier + timestamp)

        # exist_ok=True replaces the former "check, then create" sequence,
        # which could fail if the directory appeared between the two calls.
        os.makedirs(output_dir, exist_ok=True)

        new_file_path = os.path.join(output_dir, identifier + os.path.basename(file_name))
        df.to_csv(new_file_path, index=False)
        return new_file_path
    except Exception as e:
        handle_error(e)
def select_csv_file_2d_token_representation():
    """
    Opens a file dialog allowing the user to select a CSV file.

    An information text block is shown in the terminal first, then a graphical
    file dialog restricted to ``*.csv`` files is opened. The selection is
    accepted only if the file's base name contains one of the supported
    tokenizer names ('CPWord', 'Octuple', 'OctupleMono', 'MuMIDI'); otherwise a
    warning text block is shown and the whole procedure is repeated.

    Returns
    -------
    str
        The path to the selected file if a valid file was selected.
    None
        None if the user canceled the dialog.

    Notes
    -----
    Any exception raised during the selection is caught and forwarded to
    `handle_error` instead of propagating directly.
    """
    try:
        tokenizers_available_for_refining = ['CPWord', 'Octuple', 'OctupleMono', 'MuMIDI']

        while True:
            # The text blocks are rebuilt for every display call so each call
            # receives a fresh dictionary, as in the original implementation.
            info_textblock = {
                "menu_displayed_text": [
                    "-- Tokenized File Selection --",
                    "Please read the following information:",
                    "<To continue and select a file, please press Enter>",
                    ["", "Information"],
                ],
                "menu_entries_text": [
                    ["Valid Files",
                     "The selected file must be processed by any of the following tokenizers in order to be processed by this tool:"],
                    ["Tokenizers:",
                     "CPWord, Octuple, OctupleMono, MuMIDI (Two-Dimensional Tokenizers)"],
                    ["File Requirements",
                     "The filename must contain one of these strings: 'CPWord', 'Octuple', 'OctupleMono', 'MuMIDI'.\n"
                     "Please select a file that matches this requirement."],
                    ["Return to Main Menu",
                     "If you do not have a suitable file at the moment, press enter and close the selection display that will be displayed right after.\n"
                     "You will then be returned to the Main Menu."]
                ]
            }
            display_menu_print_textblock(info_textblock)

            # NOTE: the root window is deliberately not withdrawn here; the
            # original had the `root.withdraw()` call disabled.
            root = tk.Tk()
            try:
                file_path = filedialog.askopenfilename(filetypes=[("CSV Files", "*.csv")])
            finally:
                # Always tear the root window down, even if the dialog fails.
                root.destroy()

            # Check for cancellation before touching the path. A falsy test also
            # covers an empty tuple, in case the dialog returns that rather than
            # an empty string on cancel.
            if not file_path:
                return None

            base_file_name = os.path.basename(file_path)
            if any(tokenizer in base_file_name for tokenizer in tokenizers_available_for_refining):
                return file_path

            warning_textblock = {
                "menu_displayed_text": [
                    "-- Tokenizer Warning --",
                    "Please read the following warning:",
                    "<To continue, please press Enter>",
                    ["", "Warning"],
                ],
                "menu_entries_text": [
                    ["Tokenizer Warning",
                     "The selected file does not contain the name of any of the valid tokenizers.\n"
                     "Please select a file that matches one of the required patterns and includes a valid tokenizer name."]
                ]
            }
            display_menu_print_textblock(warning_textblock)
    except Exception as e:
        handle_error(e)
def display_tokenizable_files_in_folder(folder_path):
    """
    Lists the tokenizable files of a directory in a results menu.

    The file names are obtained from `get_tokenizable_files_in_folder`. At most
    the first 30 names are displayed; if there are more, a trailing marker row
    indicates that further files are not shown. Entering 'r' (case-insensitive)
    in the menu re-reads the folder and shows the menu again; any other input
    ends the loop.

    Parameters
    ----------
    folder_path : str
        The path of the directory to search for tokenizable files.

    Returns
    -------
    list
        The complete list of tokenizable file names from the last folder scan
        (not limited to the 30 displayed entries).

    Notes
    -----
    Any exception raised during the process is caught and forwarded to
    `handle_error` instead of propagating directly.
    """
    try:
        while True:
            tokenizable_files = get_tokenizable_files_in_folder(folder_path)

            # Slicing yields a new list, so the marker appended below never
            # ends up in the list that is returned to the caller.
            shown_files = tokenizable_files[:30]
            if len(tokenizable_files) > 30:
                shown_files.append("... (more files not shown)")

            header = [
                f"File Tokenization: Found MIDI files ({len(tokenizable_files)} files found)",
                f"In Folder: '{folder_path}'",
                "<To continue, please press Enter (enter 'r' to refresh)> ",
                ["File Path"],
            ]
            rows = [[name] for name in shown_files]
            user_input = display_menu_print_results({
                "menu_displayed_text": header,
                "menu_entries_results": rows,
            })

            # Anything other than 'r' / 'R' finishes the display loop.
            if user_input.lower() != 'r':
                return tokenizable_files
    except Exception as e:
        handle_error(e)
def get_tokenizable_files_in_folder(folder_path):
    """
    Collects the names of all MIDI files found directly in a folder.

    Every entry returned by `os.listdir` whose extension is '.midi' or '.mid'
    (compared case-insensitively) is considered tokenizable. Only the entry
    names are returned, not full paths, and sub-folders are not searched.

    Parameters
    ----------
    folder_path : str
        The path of the folder to be searched.

    Returns
    -------
    list
        The names of the entries that can be tokenized, in `os.listdir` order.

    Notes
    -----
    Any exception raised during the process is caught and forwarded to
    `handle_error` instead of propagating directly.
    """
    try:
        midi_suffixes = ('.midi', '.mid')
        matches = []
        for entry in os.listdir(folder_path):
            _, suffix = os.path.splitext(entry)
            if suffix.lower() in midi_suffixes:
                matches.append(entry)
        return matches
    except Exception as e:
        handle_error(e)
def display_success_rate(files_status, num_files_tokenized, num_tokenized):
    """
    Displays the tokenization success rate and details of the processed files.

    If at least one entry of `files_status` lacks the marker
    "<successfully converted>", only those failed entries are listed. Otherwise
    the first 30 entries are listed, followed by a marker row when more than 30
    entries exist. In both cases the header shows the success rate as
    ``num_tokenized/num_files_tokenized`` together with a percentage.

    Parameters
    ----------
    files_status : list
        One entry per tokenization attempt. Each entry is unpacked into
        (tokenizer name, file, status); an entry counts as successful when
        "<successfully converted>" passes a membership test against it.
    num_files_tokenized : int
        The total number of files that were attempted to be tokenized.
    num_tokenized : int
        The total number of files that were successfully tokenized.

    Returns
    -------
    None

    Notes
    -----
    Any exception raised during the process is caught and forwarded to
    `handle_error` instead of propagating directly.
    """
    try:
        # Guard against ZeroDivisionError when no file was attempted at all;
        # the rate is reported as 0.00% in that case.
        if num_files_tokenized:
            success_rate = (num_tokenized / num_files_tokenized) * 100
        else:
            success_rate = 0.0

        failed_files_status = [status for status in files_status if "<successfully converted>" not in status]

        if failed_files_status:
            # Failures take priority: list all of them and nothing else.
            entries_to_display = failed_files_status
        else:
            # Everything succeeded: show at most 30 entries. The slice is a
            # copy, so appending the marker does not modify `files_status`.
            entries_to_display = files_status[:30]
            if len(files_status) > 30:
                entries_to_display.append(["...", "...", "<more files not shown>"])

        text_dict = {
            "menu_displayed_text": [
                "File Tokenization: Tokenization Summary",
                f"Conversion Success Rate {num_tokenized}/{num_files_tokenized} ({success_rate:.2f}%)",
                "<To continue, please press Enter>",
                ["Tokenizer name", "File name", "Details"]
            ],
            "menu_entries_results": [
                [str(tokenizer_name), str(file), f"'{status!s}'"]
                for tokenizer_name, file, status in entries_to_display
            ]
        }
        display_menu_print_results(text_dict)
    except Exception as e:
        handle_error(e)
def combine_csv_files_in_directory(directory_path, output_file_name):
    """
    Combines all CSV files in a given directory into a single CSV file.

    Every ``*.csv`` file directly inside `directory_path` is read into a
    DataFrame. A 'filename' column is inserted as the first column, holding
    the part of the CSV file's base name that precedes '_tokenizer'. The
    DataFrames are concatenated in sorted path order and written to
    `output_file_name` inside the same directory, without the index.

    A file that already exists at the output location is excluded from the
    inputs, so running the function again does not merge an earlier result
    into the new one.

    Parameters
    ----------
    directory_path : str
        The path to the directory containing the CSV files.
    output_file_name : str
        The name of the combined output CSV file.

    Returns
    -------
    None

    Notes
    -----
    Any exception raised during the process, including the `FileNotFoundError`
    raised here when there is nothing to combine, is caught and forwarded to
    `handle_error` instead of propagating directly.
    """
    try:
        output_path = os.path.join(directory_path, output_file_name)
        normalized_output = os.path.normcase(os.path.abspath(output_path))

        # Sorted for a reproducible row order. The output file lives in the
        # globbed directory, so it must be filtered out: it already has a
        # 'filename' column, which would make df.insert() fail on a re-run.
        csv_files = sorted(
            path for path in glob.glob(os.path.join(directory_path, "*.csv"))
            if os.path.normcase(os.path.abspath(path)) != normalized_output
        )

        if not csv_files:
            # pd.concat would otherwise fail with an unspecific ValueError.
            raise FileNotFoundError(f"No CSV files to combine in '{directory_path}'")

        df_list = []
        for csv_file in tqdm(csv_files, ncols=70):  # tqdm renders a progress bar
            df = pd.read_csv(csv_file)

            # File names are assumed to follow the pattern
            # "<original_name>_tokenizer_<tokenizer_name>_tokens.csv".
            original_file_name = os.path.basename(csv_file).split('_tokenizer')[0]
            df.insert(0, 'filename', original_file_name)
            df_list.append(df)

        combined_df = pd.concat(df_list, ignore_index=True)
        combined_df.to_csv(output_path, index=False)
    except Exception as e:
        handle_error(e)
def create_log_entry(log_list, log_list_path):
    """
    Appends a new row to an .xlsx file, creating the file if it doesn't exist.

    If a file exists at `log_list_path` it is opened with openpyxl; otherwise a
    new, empty workbook is created. `log_list` is appended as a new row to the
    workbook's active sheet and the workbook is saved to `log_list_path`.

    Parameters
    ----------
    log_list : list
        The values to be appended as a new row.
    log_list_path : str
        The path to the .xlsx file.

    Returns
    -------
    None

    Notes
    -----
    Any exception raised during the process is caught and forwarded to
    `handle_error` instead of propagating directly.
    """
    try:
        if os.path.isfile(log_list_path):
            wb = load_workbook(log_list_path)
        else:
            # load_workbook cannot open a missing file, so start a fresh
            # workbook to honour the documented "create if missing" contract.
            wb = Workbook()

        ws = wb.active
        ws.append(log_list)
        wb.save(log_list_path)
    except Exception as e:
        handle_error(e)