# Source code for iMaT.src.tokenization.main

"""
Module: tokenization.main.py
============================

This module provides functions for tokenizing MIDI files in a selected directory. These files are converted into token sequences
using one or more tokenizers and saved as CSV files in a subdirectory of the original directory.

Functions
---------
- `tokenization_tokenize_folder_midi_files`: Tokenizes all MIDI files in a user-selected directory using selected tokenizers.
- `select_miditok_tokenizer`: Presents a user interface for tokenizer selection.
- `select_folder_midi_files`: Presents a user interface for folder selection.
- `tokenize_midi_file`: Tokenizes a MIDI file with a specified tokenizer class and saves the tokenized sequence to a CSV file.
- `extract_tokens_from_token_string_within_list`: Extracts tokens from a string representation of a list using regex.
- `extract_tokens_from_token_string_within_nested_list`: Extracts tokens from a string representation of a nested list using regex and abstract syntax trees (AST).

Notes
-----
Functions in this module can be used directly or through the CLI interface provided by the `cli.cli_menu_structure` module.
The tokenized sequences are saved as CSV files in a subdirectory of the original directory.
Please refer to the individual function docstrings for more detailed descriptions and examples of usage.
"""
import ast
import datetime
import os
import re
import tkinter as tk
from tkinter import filedialog

import pandas as pd
from miditok import CPWord, MIDILike, MuMIDI, Octuple, OctupleMono, REMI, REMIPlus, Structured, TSD
from openpyxl.workbook import Workbook
from tqdm import tqdm

from iMaT.src.cli.menu_constructors import display_menu_print_textblock, \
    display_menu_request_selection
from iMaT.src.tokenization.utils import combine_csv_files_in_directory, create_log_entry, display_success_rate, \
    display_tokenizable_files_in_folder, \
    get_tokenizable_files_in_folder
from iMaT.src.utils.error_handling import handle_error

# Registry of the miditok tokenizers offered by this module.
# Each entry is [display name, tokenizer class, short description]:
#   - index 0 (name) is used for output folder/file names and as the key into
#     MIDITOK_TOKENIZERS_HEADERS,
#   - index 1 (class) is what select_miditok_tokenizer returns and what
#     tokenization_tokenize_folder_midi_files instantiates,
#   - index 2 (description) is only passed on to the selection menu.
MIDITOK_TOKENIZERS_LIST = [
    ['REMI', REMI, '<One-Dimensional: Event-based, includes timing, bar info>'],
    ['REMIPlus', REMIPlus, '<One-Dimensional: Extended REMI, multi-track, multi-signature>'],
    ['MIDILike', MIDILike, '<One-Dimensional: Converts MIDI messages to tokens>'],
    ['TSD', TSD, '<One-Dimensional: Similar to MIDI-Like, uses explicit Duration tokens>'],
    ['Structured', Structured, '<One-Dimensional: Similar to TSD, consistent token type succession>'],
    ['CPWord', CPWord, '<Two-Dimensional: Uses embedding pooling to reduce sequence length>'],
    ['Octuple', Octuple, '<Two-Dimensional: Embedding pooling, single note representation>'],
    ['OctupleMono', OctupleMono, '<Two-Dimensional: Like Octuple, but suited for one track>'],
    ['MuMIDI', MuMIDI, '<Two-Dimensional: Multitrack tasks, uses embedding pooling>']
]

# CSV column headers per tokenizer name (keys match index 0 of the entries in
# MIDITOK_TOKENIZERS_LIST). tokenization_tokenize_folder_midi_files applies a
# header list only when its length equals the number of DataFrame columns;
# otherwise it falls back to generic 'column1', 'column2', ... names.
MIDITOK_TOKENIZERS_HEADERS = {
    'REMI': ['Tokens'],
    'REMIPlus': ['Tokens'],
    'MIDILike': ['Tokens'],
    'TSD': ['Tokens'],
    'Structured': ['Tokens'],
    'CPWord': ['Family', 'Position', 'Pitch', 'Velocity', 'Duration', 'Rest'],
    'Octuple': ['Pitch', 'Velocity', 'Duration', 'Program', 'Position', 'Bar'],
    'OctupleMono': ['Pitch', 'Velocity', 'Duration', 'Position', 'Bar'],
    'MuMIDI': ['Type*', 'BarPosEnc', 'PositionPosEnc', 'Velocity', 'Duration']
}

# Configuration passed as the `additional_tokens=` keyword argument when
# tokenization_tokenize_folder_midi_files instantiates a tokenizer (every
# tokenizer except 'Structured' and 'REMIPlus', which are built with defaults).
# Of the boolean entries only 'Rest' is set to True.
# NOTE(review): the meaning of each key is defined by miditok, not by this
# module - presumably the booleans toggle the corresponding token types and the
# remaining keys parameterise them; confirm against the installed miditok version.
MIDITOK_TOKENIZERS_ADDITIONAL_TOKENS = {
    'Chord': False,
    'Program': False,
    'Rest': True,
    'Tempo': False,
    'TimeSignature': False,
    # Chord quality name -> tuple of integers (presumably semitone offsets
    # from the chord root - TODO confirm).
    'chord_maps': {
        '7aug': (0, 4, 8, 11),
        '7dim': (0, 3, 6, 9),
        '7dom': (0, 4, 7, 10),
        '7halfdim': (0, 3, 6, 10),
        '7maj': (0, 4, 7, 11),
        '7min': (0, 3, 7, 10),
        '9maj': (0, 4, 7, 10, 14),
        '9min': (0, 4, 7, 10, 13),
        'aug': (0, 4, 8),
        'dim': (0, 3, 6),
        'maj': (0, 4, 7),
        'min': (0, 3, 7),
        'sus2': (0, 2, 7),
        'sus4': (0, 5, 7)
    },
    'chord_tokens_with_root_note': True,
    'chord_unknown': False,
    'nb_tempos': 32,
    'programs': list(range(-1, 128)),  # the integers -1 through 127
    'rest_range': (2, 16),
    'tempo_range': (40, 250),
    'time_signature_range': (8, 2)
}


def tokenization_tokenize_folder_midi_files():
    """
    Tokenize every MIDI file of a user-selected folder with the selected tokenizer(s).

    The user first picks a folder and then one tokenizer (or all of them). For each
    selected tokenizer a sub-folder ``tokenizer_<name>`` is created inside a new
    ``tokenized_YYYYMMDD_HHMMSS`` directory of the selected folder. Every tokenizable
    file is converted to a token table and written there as
    ``<file>_tokenizer_<name>_tokens.csv``; afterwards the per-file CSVs of that
    tokenizer are combined via ``combine_csv_files_in_directory``. Files that fail
    are recorded in ``tokenization_log.xlsx`` inside the ``tokenized_...`` directory,
    and a success summary is displayed at the end.

    Parameters
    ----------
    None

    Returns
    -------
    None
        Also returns early (without creating anything) when no folder or no
        tokenizer was selected.

    See Also
    --------
    select_folder_midi_files : Function to interact with user for selecting a folder.
    display_tokenizable_files_in_folder : Function to display tokenizable files.
    select_miditok_tokenizer : Function to interact with user for selecting tokenizer(s).
    """
    folder_path = select_folder_midi_files()
    if folder_path is None:
        return

    tokenizable_files = display_tokenizable_files_in_folder(folder_path)

    selected_tokenizers = select_miditok_tokenizer()
    if not selected_tokenizers:
        # select_miditok_tokenizer falls through to an implicit None when its
        # error handler returns; there is nothing to tokenize with in that case.
        return

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    tokenization_dir = os.path.join(folder_path, "tokenized_" + timestamp)
    os.mkdir(tokenization_dir)

    # Build the log path with os.path.join so the log file really ends up inside
    # tokenization_dir on every platform (a hard-coded '\\' only works on Windows).
    log_file_path = os.path.join(tokenization_dir, "tokenization_log.xlsx")
    workbook = Workbook()
    workbook.save(log_file_path)

    num_files_tokenized = len(tokenizable_files) * len(selected_tokenizers)
    num_tokenized = 0
    files_status = []

    for tokenizer_class in selected_tokenizers:
        # Look up the name of the tokenizer
        tokenizer_name = next((x[0] for x in MIDITOK_TOKENIZERS_LIST if x[1] == tokenizer_class), None)
        if not tokenizer_name:
            # Report the class that could not be resolved (the name is None here).
            print(f"Error: Could not find tokenizer '{tokenizer_class}'")
            continue  # Skip if the tokenizer name is not found

        if tokenizer_name not in ['Structured', 'REMIPlus']:
            tokenizer = tokenizer_class(additional_tokens=MIDITOK_TOKENIZERS_ADDITIONAL_TOKENS,
                                        beat_res={(0, 16): 16})
        else:
            tokenizer = tokenizer_class()

        tokenized_file_folder = os.path.join(tokenization_dir, f"tokenizer_{tokenizer_name}")
        os.mkdir(tokenized_file_folder)

        # The expected column names only depend on the tokenizer, not on the file.
        headers = MIDITOK_TOKENIZERS_HEADERS.get(tokenizer_name, [])

        for file in tqdm(tokenizable_files, ncols=70):  # Wrap tokenizable_files with tqdm for progress bar
            try:
                midi_file_path = os.path.join(folder_path, file)
                tokens = tokenizer(midi_file_path)
                tokens_string = str(tokens)

                # check if the tokens are in a nested list
                if "[[" in tokens_string and "]]" in tokens_string:
                    tokens_list = extract_tokens_from_token_string_within_nested_list(tokens_string)
                else:
                    tokens_list = extract_tokens_from_token_string_within_list(tokens_string)

                if tokens_list is None:
                    # The extract helpers return None when their error handler
                    # swallowed an exception; do not write an empty CSV and
                    # count it as a success.
                    raise ValueError(f"Could not extract tokens from the tokenizer output for '{file}'")

                # Convert the list of tokens to a DataFrame
                df = pd.DataFrame(tokens_list)

                # Check if the DataFrame has the right number of columns
                if len(df.columns) == len(headers):
                    df.columns = headers
                else:
                    print(
                        f'\nWarning: Number of columns in the DataFrame does not match the number of headers for '
                        f'tokenizer {tokenizer_name}. Defaulting to generic headers.\n')
                    df.columns = ['column' + str(i) for i in range(1, len(df.columns) + 1)]

                # Save the DataFrame to a CSV file
                base_name = os.path.splitext(file)[0]  # Get file base name without extension
                file_name_token = f"{base_name}_tokenizer_{tokenizer_name}_tokens.csv"
                file_path = os.path.join(tokenized_file_folder, file_name_token)
                df.to_csv(file_path, index=False)

                num_tokenized += 1
                files_status.append([tokenizer_name, file, "<successfully converted>"])

            except Exception as e:
                # A failing file must not abort the batch: record it and go on.
                files_status.append([tokenizer_name, file, str(e)])
                exception_list = [tokenizer_name, os.path.join(tokenized_file_folder, file), str(e)]
                create_log_entry(exception_list, log_file_path)

        combine_csv_files_in_directory(tokenized_file_folder, f"00_combined_tokenizer_{tokenizer_name}_tokens.csv")

    display_success_rate(files_status, num_files_tokenized, num_tokenized)


def select_miditok_tokenizer():
    """
    Ask the user which tokenizer(s) to apply.

    A selection menu is built from ``MIDITOK_TOKENIZERS_LIST`` plus an additional
    "All" entry and handed to ``display_menu_request_selection``.

    Parameters
    ----------
    None

    Returns
    -------
    list
        A one-element list holding the value returned by the menu, or - when the
        menu returns ``"return_all"`` - the classes of all tokenizers listed in
        ``MIDITOK_TOKENIZERS_LIST``. If an exception occurs it is passed to
        ``handle_error`` and nothing is returned explicitly.

    See Also
    --------
    display_menu_request_selection : Function to display menu and request user selection.
    """
    try:
        # One menu row per registered tokenizer, followed by the "All" shortcut.
        menu_rows = [[entry[0], entry[1], entry[2]] for entry in MIDITOK_TOKENIZERS_LIST]
        menu_rows.append(["All", "return_all", "Apply all tokenizers"])

        menu_definition = {
            "menu_displayed_text": [
                "File Tokenization: Tokenizer Selection",
                "Please select one of the following tokenizers by entering the corresponding index number:",
                "Which tokenizer do you want to select? (<No. of menu item>): ",
                ["Tokenizer", "Description"]
            ],
            "menu_entries": menu_rows
        }

        selection = display_menu_request_selection(menu_definition)
        return (
            [entry[1] for entry in MIDITOK_TOKENIZERS_LIST]
            if selection == "return_all"
            else [selection]
        )
    except Exception as error:
        handle_error(error)


def select_folder_midi_files():
    """
    Let the user pick a folder that contains tokenizable music files.

    An information text block is displayed first, then a Tk directory chooser is
    opened. If the chosen folder contains no tokenizable files (as reported by
    ``get_tokenizable_files_in_folder``), an error text block is displayed and the
    user is asked again.

    Parameters
    ----------
    None

    Returns
    -------
    str or None
        Path of the selected folder, or ``None`` if the user cancelled the dialog.
        If an exception occurs it is passed to ``handle_error`` and nothing is
        returned explicitly.

    See Also
    --------
    get_tokenizable_files_in_folder : Function to get tokenizable files in the selected folder.
    display_menu_print_textblock : Function to display menu.
    """
    try:
        parsable_extensions = ['.midi', '.mid']

        while True:
            textblock_dict = {
                "menu_displayed_text": [
                    "-- Folder Selection --",
                    "Please read the following information:",
                    "<To continue and select a folder, please press Enter>",
                    ["", "Information"],
                ],
                "menu_entries_text": [
                    ["Parsable Music File Types", ", ".join(parsable_extensions)],
                    ["Folder Requirements",
                     "The folder must contain at least one parsable music file of the above types to be processed by this tool."],
                    ["Return to Main Menu",
                     "If you do not have a suitable folder with valid files at the moment, press enter and close the selection display that will be displayed right after.\n"
                     "You will then be returned to the Main Menu."]
                ]
            }
            display_menu_print_textblock(textblock_dict)

            # NOTE(review): root.withdraw() is disabled in the original code, so the
            # empty root window stays visible while the dialog is open - confirm
            # that this is intended.
            root = tk.Tk()
            try:
                # Open a directory dialog and get the selected folder path
                folder_path = filedialog.askdirectory()
            finally:
                # Always tear the root window down, even if the dialog raised.
                root.destroy()

            # A cancelled dialog yields an empty string; some Tk builds are reported
            # to return an empty tuple instead, so test for "empty" rather than "".
            if not folder_path:
                return None

            if len(get_tokenizable_files_in_folder(folder_path)) == 0:
                error_message_dict = {
                    "menu_displayed_text": [
                        "File Conversion: Folder Selection - Error",
                        "The selected directory contains no parsable music files:",
                        "<To continue, please press Enter>",
                        ["", "Troubleshooting assistance:"]
                    ],
                    "menu_entries_text": [
                        ["Parsable Music File Types", ", ".join(parsable_extensions)],
                        ["Reason 1:", "There are no parsable music files in the selected directory."],
                        ["Reason 2:", "Ensure the directory you select contains the music files you want to convert."],
                        ["Return to Main Menu",
                         "If you do not have a suitable folder with valid files at the moment, press enter and close the "
                         "selection display that will be displayed right after.\n"
                         "You will then be returned to the Main Menu."]
                    ]
                }
                display_menu_print_textblock(error_message_dict)
                continue

            return folder_path

    except Exception as e:
        handle_error(e)


def tokenize_midi_file(midi_file_path, tokenizer_class, tokenized_file_folder):
    """
    Tokenize one MIDI file and write the token table to a CSV file.

    The tokenizer is instantiated with its default arguments and applied to the
    file. The string representation of the result is parsed with one of the two
    ``extract_tokens_from_token_string_within_*`` helpers, depending on whether it
    contains a nested list. The resulting table is written without index and
    without header row to ``<tokenizer class name>_tokens.csv`` inside
    ``tokenized_file_folder``.

    Parameters
    ----------
    midi_file_path : str
        Path to the MIDI file to be tokenized.
    tokenizer_class : class
        Tokenizer class; it is called without arguments to create the tokenizer.
    tokenized_file_folder : str
        Folder in which the CSV file is written.

    Returns
    -------
    None
        Any exception is passed to ``handle_error``.

    See Also
    --------
    extract_tokens_from_token_string_within_list : Function to extract tokens from a list.
    extract_tokens_from_token_string_within_nested_list : Function to extract tokens from a nested list.
    """
    try:
        # Tokenize and work on the textual representation of the result.
        token_repr = str(tokenizer_class()(midi_file_path))

        # "[[ ... ]]" in the representation marks a nested token list.
        looks_nested = "[[" in token_repr and "]]" in token_repr
        token_rows = (
            extract_tokens_from_token_string_within_nested_list(token_repr)
            if looks_nested
            else extract_tokens_from_token_string_within_list(token_repr)
        )

        token_table = pd.DataFrame(token_rows)

        output_path = os.path.join(tokenized_file_folder, f"{tokenizer_class.__name__}_tokens.csv")
        token_table.to_csv(output_path, index=False, header=False)
    except Exception as error:
        handle_error(error)


def extract_tokens_from_token_string_within_list(tokens_string):
    """
    Extract the tokens of a flat token list from its string representation.

    Only the text between the first ``'=['`` and the next ``']'`` is considered;
    every single-quoted item in that span is returned.

    Parameters
    ----------
    tokens_string : str
        String containing a list of single-quoted tokens introduced by ``'=['``,
        e.g. ``"...(tokens=['A', 'B'], ids=[...])"``.

    Returns
    -------
    list
        The extracted tokens as strings, in order of appearance. If an exception
        occurs (for example because ``'=['`` or ``']'`` is missing) it is passed to
        ``handle_error`` and nothing is returned explicitly.

    See Also
    --------
    tokenize_midi_file : Function to tokenize MIDI files.
    """
    try:
        # Span of interest: right after the first "=[" up to the next "]".
        list_start = tokens_string.index('=[') + 2
        list_end = tokens_string.index(']', list_start)

        # Every single-quoted item inside that span is a token.
        return re.findall(r"'([^']*)'", tokens_string[list_start:list_end])
    except Exception as error:
        handle_error(error)


def extract_tokens_from_token_string_within_nested_list(tokens_string):
    """
    Extract the tokens of a nested token list from its string representation.

    Only the text between the first ``'=['`` and the first following ``'], ids='``
    is considered. Every innermost bracketed list found in that span is parsed with
    ``ast.literal_eval`` and returned as one sub-list.

    Parameters
    ----------
    tokens_string : str
        String containing a nested list of tokens introduced by ``'=['`` and
        followed by ``'], ids='``, e.g.
        ``"...(tokens=[['A', 'B'], ['C', 'D']], ids=[[...]])"``.

    Returns
    -------
    list
        List of lists, one sub-list per inner list, each holding the parsed tokens.
        Sub-lists with a single token stay lists and empty sub-lists are returned
        as ``[]``. If an exception occurs (for example because one of the markers
        is missing) it is passed to ``handle_error`` and nothing is returned
        explicitly.

    See Also
    --------
    tokenize_midi_file : Function to tokenize MIDI files.
    """
    try:
        # cut away everything up to and including the first "=["
        start_index = tokens_string.index('=[') + 2
        trimmed_string = tokens_string[start_index:]

        # cut away everything from the first "], ids=" onwards
        end_index = trimmed_string.index('], ids=')
        trimmed_string = trimmed_string[:end_index]

        # capture the content of every innermost "[...]"
        search_pattern = r'\[([^\[\]]*?)\]'
        matched_lists = re.findall(search_pattern, trimmed_string)

        # Re-wrap the captured content in brackets before evaluating it. Evaluating
        # the bare content would give a tuple for several items, a plain string for
        # a single item and a SyntaxError for an empty list.
        token_lists = [ast.literal_eval(f"[{selected_list}]") for selected_list in matched_lists]

        return token_lists
    except Exception as e:
        handle_error(e)